diff --git a/CMakeLists.txt b/CMakeLists.txt index c96639f336..e9111cac6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -317,6 +317,8 @@ set(SRC_FILES src/include/git_version.h src/include/graph.h src/include/group.h + src/include/ibvcore.h + src/include/ibvsymbols.h src/include/ibvwrap.h src/include/info.h src/include/ipcsocket.h @@ -379,6 +381,7 @@ set(SRC_FILES src/misc/argcheck.cc # src/misc/cudawrap.cc # src/misc/gdrwrap.cc + src/misc/ibvsymbols.cc src/misc/ibvwrap.cc src/misc/ipcsocket.cc src/misc/msccl/msccl_lifecycle.cc diff --git a/makefiles/common.mk b/makefiles/common.mk index 35d1826e3f..60a019c0b2 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -12,6 +12,7 @@ DEBUG ?= 0 TRACE ?= 0 PROFAPI ?= 1 NVTX ?= 1 +RDMA_CORE ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -106,3 +107,7 @@ endif ifneq ($(PROFAPI), 0) CXXFLAGS += -DPROFAPI endif + +ifneq ($(RDMA_CORE), 0) +CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index 6877b63a09..ba162237d4 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 17 -NCCL_PATCH := 1 +NCCL_MINOR := 18 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index ca5ddce466..dd5754989e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \ - misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \ + misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \ misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \ misc/ipcsocket.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc 
\ diff --git a/src/bootstrap.cc b/src/bootstrap.cc index e542e26c87..fdbb7d04c8 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -222,7 +222,6 @@ struct bootstrapState { int cudaDev; int rank; int nranks; - int virtualId; uint64_t magic; volatile uint32_t *abortFlag; }; @@ -230,7 +229,6 @@ struct bootstrapState { ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) { int rank = comm->rank; int nranks = comm->nRanks; - int virtualId = comm->virtualId; struct bootstrapState* state; struct ncclSocket* proxySocket; ncclSocketAddress nextAddr; @@ -241,11 +239,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* state->rank = rank; state->nranks = nranks; state->abortFlag = comm->abortFlag; - state->virtualId = virtualId; comm->bootstrap = state; comm->magic = state->magic = handle->magic; - TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId); + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); // [RCCL] Register custom signal handlers if requested RegisterSignalHandlers(); @@ -308,11 +305,79 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress))); NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses)); - TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId); + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } +ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) { + ncclResult_t ret = ncclSuccess; + int rank = comm->rank; + int nranks = comm->nRanks; + int prev, next; + ncclSocketAddress listenAddr, tmpAddr; + struct ncclSocket* proxySocket; + struct bootstrapState* state; + + NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail); + state->rank = rank; + state->nranks = nranks; + 
state->abortFlag = comm->abortFlag; + comm->bootstrap = state; + comm->magic = state->magic = handle->magic; + + prev = parentRanks[(rank-1+nranks)%nranks]; + next = parentRanks[(rank+1)%nranks]; + + // Setup my sockets for the allgather ring and other p2p connections + NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); + + // Create socket for other ranks to contact me + NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail); + + // Get addr from next rank + NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail); + NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail); + NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail); + + NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); + NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail); + // Accept the connect request from the previous rank in the AllGather ring + NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail); + + // AllGather all listen handlers + NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail); + memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress)); + NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail); + + if (parent->config.splitShare) { + /* map local rank to top parent local rank. 
*/ + for (int i = 0; i < nranks; ++i) { + comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; + } + comm->proxyState = parent->sharedRes->proxyState; + ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount); + } else { + // Create the service proxy + NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail); + NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail); + NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail); + memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress)); + NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail); + NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail); + } + + INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next); + +exit: + return ret; +fail: + goto exit; +} + ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { struct bootstrapState* state = (struct bootstrapState*)commState; char* data = (char*)allData; @@ -344,7 +409,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s struct bootstrapState* state = (struct bootstrapState*)commState; struct ncclSocket sock; - NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail); NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail); @@ -405,7 +470,7 @@ 
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, } } else { - NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size)); + NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/ranks[rank], bcastData, size)); } TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size); diff --git a/src/channel.cc b/src/channel.cc index ed4c623d30..5a06029d9f 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -17,32 +17,122 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { channel->id = channelId; channel->workFifoSent = 0; - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream)); + struct ncclSharedResources* sharedRes = comm->sharedRes; - // The extra on nRanks+1 is for collnet root (i.e. network) - channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers); - NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream)); - ncclCommPushCudaFree(comm, channel->devPeers); + NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); - channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks); - NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.cudaStream)); - ncclCommPushCudaFree(comm, channel->devRingUserRanks); - - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream)); - CUDACHECK(hipEventRecord(comm->deviceStream.scratchEvent, comm->deviceStream.cudaStream)); - CUDACHECK(hipStreamWaitEvent(comm->deviceStream.cudaStream, comm->deviceStream.scratchEvent, 0)); - - for (int r=0; r < nPeers; ++r) { - for (int b=0; b < NCCL_MAX_CONNS; b++) { - channel->peers[r].send[b].comm = comm; - channel->peers[r].recv[b].comm = comm; + if (channel->peers == NULL) { + // The extra on nRanks+1 is for collnet root (i.e.
network) + // Allocate everything related to sharedRes with ncclCalloc as this can be + // shared between communicators hence should not be tied to comm. + if (sharedRes->peers[channelId] == NULL) { + NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks)); + } + channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers); + for (int r = 0; r < nRanks; r++) { + channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r]; + ncclAtomicRefCountIncrement(&channel->peers[r]->refCount); } } + if (channel->devPeers == NULL) { + if (sharedRes->devPeers[channelId] == NULL) { + NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); + } + /* channel->devPeers is not shared, so just free it when calling commFree() */ + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); + ncclCommPushCudaFree(comm, channel->devPeers); + for (int r = 0; r < nRanks; r++) { + uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + } + } + + channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); + ncclCommPushCudaFree(comm, channel->devRingUserRanks); + + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); + CUDACHECK(hipEventRecord(sharedRes->deviceStream.scratchEvent, sharedRes->deviceStream.cudaStream)); + CUDACHECK(hipStreamWaitEvent(sharedRes->deviceStream.cudaStream, sharedRes->deviceStream.scratchEvent, 0)); + return ncclSuccess; } -ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { +ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { + struct
ncclChannel* channel = &comm->channels[channelId]; + struct ncclSharedResources* sharedRes = comm->sharedRes; + + if (channel->nvlsPeers != NULL) + return ncclSuccess; + + if (channel->id == -1) + NCCLCHECK(initChannel(comm, channelId)); + + NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + + if (share) { + channel->nvlsPeers = parent->channels[channelId].nvlsPeers; + channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers; + for (int r = 0; r < comm->localRanks; ++r) { + int tr = comm->topParentLocalRanks[r]; + uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); + channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); + } + } else { + NCCLCHECK(ncclCalloc(&channel->nvlsPeers, comm->localRanks)); + NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, comm->localRanks, sharedRes->deviceStream.cudaStream)); + for (int r = 0; r < comm->localRanks; ++r) { + uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); + channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); + } + } + + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); + + return ncclSuccess; +} + +ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { + struct ncclChannel* channel = &comm->channels[channelId]; + struct ncclSharedResources* sharedRes = comm->sharedRes; + uintptr_t addr; + + if (channel->collnetPeers != NULL) + return ncclSuccess; + + if (channel->id == -1) + 
NCCLCHECK(initChannel(comm, channelId)); + + NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + + if (share) { + channel->collnetPeers = parent->channels[channelId].collnetPeers; + channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; + addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; + channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); + } else { + NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); + NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); + addr = (uintptr_t)channel->collnetDevPeers; + channel->peers[comm->nRanks] = channel->collnetPeers; + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); + } + + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); + + return ncclSuccess; +} + +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) { + int nPeers = nRanks + collnetNRanks + nvlsNRanks; /* channel peers are only valid when async init thread completes commAlloc() and * the channel is intialized with initChannel(); if either is not done, this channel * should never be free. 
*/ @@ -50,18 +140,23 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { // Free transport proxy resources // Note: free all send resources first due to CollNet arrangement - for (int r=0; r<nRanks+1; r++) { - struct ncclChannelPeer* peer = channel->peers+r; - for (int b=0; b<NCCL_MAX_CONNS; b++) { - if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); + for (int r = 0; r < nPeers; r++) { + struct ncclChannelPeer* peer = channel->peers[r]; + if (peer) { + if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) { + for (int b=0; b<NCCL_MAX_CONNS; b++) { + if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); + if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); + } + if (r == nRanks) { + free(channel->collnetPeers); + ncclCudaFree(channel->collnetDevPeers); + } else if (r == nPeers - 1) { + free(channel->nvlsPeers); + ncclCudaFree(channel->nvlsDevPeers); + } + } } } - for (int r=0; r<nRanks+1; r++) { - struct ncclChannelPeer* peer = channel->peers+r; - for (int b=0; b<NCCL_MAX_CONNS; b++) { - if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); - } - } - return ncclSuccess; } diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index dbfca9b082..e4ad1964b4 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -51,7 +51,7 @@ namespace { T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16); + (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex); #if defined(ENABLE_NPKIT) if (tid == 0) { @@ -85,7 +85,7 @@ namespace { if (inputBuf + chunkOffset == outputBuf + offset) { // In place prims.directSend(chunkOffset, offset, nelem); } else { - prims.directCopySend(chunkOffset, offset, offset, nelem); + prims.directCopySend(chunkOffset, offset, nelem); } // k-2 steps: copy to next GPU @@ -93,7 +93,7 @@ namespace { rankDest = 
ringRanks[nranks-j]; offset = chunkOffset + rankDest * size; - prims.directRecvCopySend(offset, offset, nelem); + prims.directRecvCopySend(offset, nelem); } // Make final copy from buffer to dest. @@ -148,19 +148,19 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args); + prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, + args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0); } } else if (tid < tidEndBcast) { - int group = (3*Proto::MaxGroupWidth) | (1<<16); - // Bcast through MC + // Bcast through NVLS Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args); + prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, + args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 57444ab024..658fc30b57 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -66,7 +66,7 @@ namespace { } Primitives, 0, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16); + (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex); #if defined(ENABLE_NPKIT) if (tid == 0) { @@ -158,7 +158,7 @@ namespace { } #endif - prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true); + prims.directRecvReduceCopySend(offset, offset, 
nelem, /*postOp=*/true); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT) if (tid == 0) { @@ -180,7 +180,7 @@ namespace { chunk = modRanks(ringIx + nranks-j); offset = calcOffset(chunk); nelem = min(realChunkSize, size-offset); - prims.directRecvCopySend(offset, offset, nelem); + prims.directRecvCopySend(offset, nelem); } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT) @@ -342,7 +342,7 @@ namespace { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); - prims.directSendFromOutput(offset, offset, nelem); + prims.directSendFromOutput(offset, nelem); } } else if (tree->down[0] == -1) { @@ -356,7 +356,7 @@ namespace { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); - prims.directRecvCopySend(offset, offset, nelem); + prims.directRecvCopySend(offset, nelem); } } @@ -446,7 +446,7 @@ namespace { chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize); if (tree->up == -1) { - // Reduce and broadcast. Max number of recv is 3, max number of send is 3 + // Reduce and broadcast. 
Max number of recv is 2, max number of send is 2 Primitives, /*Direct=*/0, Proto, 0> prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); @@ -467,7 +467,7 @@ namespace { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); - prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*doPost=*/true); + prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true); } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT) @@ -530,7 +530,8 @@ namespace { else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/0, Proto, 0> - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth); + prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, + args->redOpArg, 1*Proto::MaxGroupWidth); #if defined(ENABLE_NPKIT) if (isNpKitThread) { @@ -557,7 +558,7 @@ namespace { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); - prims.directRecvCopySend(offset, offset, nelem); + prims.directRecvCopySend(offset, nelem); } } @@ -621,9 +622,9 @@ struct RunWorkElement= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter - int group = (2*Proto::MaxGroupWidth) | (1<<16); Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, + args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = 
gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); @@ -634,16 +635,16 @@ struct RunWorkElement= tidStartReduce && direct->out != -1) { - int group = (3*Proto::MaxGroupWidth) | (1<<16); if (hasDn) { // Reduce, send to network Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, + args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); if (args->regUsed) { - prims.directRecvReduceSend(offset, offset, nelem); + prims.directRecvReduceSend(offset, nelem); } else { prims.recvReduceSend(offset, nelem); } @@ -651,7 +652,8 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group); + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, + args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -660,29 +662,30 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, + args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); 
prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) { - int group = (1*Proto::MaxGroupWidth) | (0<<16); if (hasDn) { // Recv from network, broadcast Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, + args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); - prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true); + prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true); } } else { // Recv from network (no post thread needed) Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, + args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -705,23 +708,27 @@ struct RunWorkElementcount; const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize; const int nranks = ncclShmem.comm.nRanks; - const int reduceWarps = nranks <= 6 ? 6 : 4; - const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2; + const bool hasOut = nvls->out != -1; + const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5; + const int bcastWarps = hasOut ? 
2 : 0; + const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2; + const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2; - const int nThreadsScatter = copyWarps*WARP_SIZE; - const int nThreadsGather = (copyWarps-1)*WARP_SIZE; - const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE; + const int nThreadsScatter = scatterWarps*WARP_SIZE; + const int nThreadsGather = gatherWarps*WARP_SIZE; + const int nThreadsReduce = reduceWarps*WARP_SIZE; + const int nThreadsBcast = (bcastWarps)*WARP_SIZE; const int tidEndScatter = nThreadsScatter; const int tidEndGather = tidEndScatter + nThreadsGather; const int tidEndReduce = tidEndGather + nThreadsReduce; - - using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>; + const int tidEndBcast = tidEndReduce + nThreadsBcast; if (tid < tidEndScatter) { // Scatter - int group = (0*Proto::MaxGroupWidth) | (0<<16); + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, + args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; int nelem = min(nvls->nHeads*chunkSize, size-offset); @@ -729,19 +736,136 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, + args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; int nelem = min(nvls->nHeads*chunkSize, size-offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } - 
} else if (tid < tidEndReduce) { - int group = (3*Proto::MaxGroupWidth) | (1<<16); - // Reduce, broadcast through NVLS + } else if (tid < tidEndReduce && nvls->headRank != -1) { + if (!hasOut) { + // Reduce, broadcast through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, + args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; + int nelem = min(chunkSize, size-offset); + prims.recvSend(nelem); + } + } else { + // Reduce, send to network + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, + args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; + int nelem = min(chunkSize, size-offset); + prims.recvSend(nelem); + } + } + } else if (tid < tidEndBcast && nvls->headRank != -1) { + // Recv from network, broadcast + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, + args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; + int nelem = min(chunkSize, size-offset); + prims.recvSend(nelem); + } + } + #endif // NCCL_NVLS_ENABLED + } +}; + +template +struct RunWorkElement { + __device__ __forceinline__ void run(ncclWorkElem *args) { + #if NCCL_NVLS_ENABLED + 
const int tid = threadIdx.x; + const int bid = args->bid; + const int nChannels = args->nChannels; + struct ncclNvls* nvls = &ncclShmem.channel.nvls; + const int treeUp = nvls->treeUp; + const int* treeDown = nvls->treeDown; + const ssize_t chunkSize = int(args->lastChunkSize); + const ssize_t size = args->count; + const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize; + const int nranks = ncclShmem.comm.nRanks; + const bool hasUp = treeUp != -1; + const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5; + const int bcastWarps = hasUp ? 4 : 0; + const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2; + const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2; + + const int nThreadsScatter = scatterWarps*WARP_SIZE; + const int nThreadsGather = gatherWarps*WARP_SIZE; + const int nThreadsReduce = reduceWarps*WARP_SIZE; + const int nThreadsBcast = (bcastWarps)*WARP_SIZE; + const int tidEndScatter = nThreadsScatter; + const int tidEndGather = tidEndScatter + nThreadsGather; + const int tidEndReduce = tidEndGather + nThreadsReduce; + const int tidEndBcast = tidEndReduce + nThreadsBcast; + + if (tid < tidEndScatter) { + // Scatter + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, + args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; + int nelem = min(nvls->nHeads*chunkSize, size-offset); + prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); + } + } else if (tid < tidEndGather) { + // Gather + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, + args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += 
loopSize) { + ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; + int nelem = min(nvls->nHeads*chunkSize, size-offset); + prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); + } + } else if (tid < tidEndReduce && nvls->headRank != -1) { + if (!hasUp) { + // Reduce and Broadcast + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL, + args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; + int nelem = min(chunkSize, size-offset); + prims.recvSend(nelem); + } + } else { + // Reduce, send to network + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, + args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; + int nelem = min(chunkSize, size-offset); + prims.recvSend(nelem); + } + } + } else if (tid < tidEndBcast && nvls->headRank != -1) { + // Recv from network, broadcast + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL, + args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -762,21 +886,26 @@ struct RunWorkElementlastChunkSize); const ssize_t loopSize = int(nChannels*chunkSize); + const int nranks = ncclShmem.comm.nRanks; const ssize_t size = args->count; int nthreadsSplit = nthreads/2; if (nthreadsSplit >= 256) nthreadsSplit += 64; - int group, send, 
recv, groupTid, groupNthreads; + int group, connIndex, send, recv, groupTid, groupNthreads; using Proto = ProtoSimple<1, 1>; if (tid < nthreadsSplit) { - group = (0*Proto::MaxGroupWidth) | (1<<16); + // Reduce up the chain + group = 0; + connIndex = 1; recv = tree->down[0]; send = tree->up; groupTid = tid; groupNthreads = nthreadsSplit; } else { - group = (1*Proto::MaxGroupWidth); + // Broadcast down the chain + group = 1; + connIndex = 0; recv = tree->up; send = tree->down[0]; groupTid = tid - nthreadsSplit; @@ -784,7 +913,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group); + prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, + args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex); if (tid < nthreadsSplit) { if (recv == -1) { @@ -802,17 +932,34 @@ struct RunWorkElementsendbuff != args->recvbuff) { const T* sendbuff = (const T*)args->sendbuff + send_offset; T* recvbuff = (T *)args->recvbuff + recv_offset; - ReduceOrCopyMulti( + reduceCopy( tid, nthreads, 0, nullptr, false, 1, (void **)&sendbuff, 1, (void **)&recvbuff, send_recv_size); } else { for (ssize_t prims_offset = 0; prims_offset < send_recv_size; prims_offset += prims_size) { diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index db3aed51e8..5dc72b5a4c 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -50,7 +50,7 @@ namespace { T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16); + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex); #if defined(ENABLE_NPKIT) if (tid == 0) { diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 
e8ec8a57c5..29a951ab11 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -42,7 +42,8 @@ NCCL_FUNC5(func, RING, devredop, type, nullify), \ NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \ NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \ - NCCL_FUNC5(func, NVLS, devredop, type, nullify) + NCCL_FUNC5(func, NVLS, devredop, type, nullify), \ + NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(func, devredop, nullForFloat) \ @@ -119,8 +120,8 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{ #endif }; -static_assert(FUNC_INDEX_P2P == 4510, "Wrong P2P function index"); -static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 4511, "Wrong AllToAllPivot function index"); +static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index"); +static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index"); #ifndef USE_INDIRECT_FUNCTION_CALL template @@ -180,46 +181,46 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept { else assert("Unsupported function index"); #else - if (funcIndex < 900) { - if (funcIndex % 15 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(); - else if (funcIndex % 15 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(); - else if (funcIndex % 15 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(); - else if (funcIndex % 15 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(); - else if (funcIndex % 15 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 7) 
ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t(); - else if (funcIndex % 15 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t(); - else if (funcIndex % 15 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t(); + if (funcIndex < 1080) { + if (funcIndex % 18 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(); + else if (funcIndex % 18 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(); + else if (funcIndex % 18 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(); + else if (funcIndex % 18 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(); + else if (funcIndex % 18 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t(); + else if (funcIndex % 18 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t(); + else if (funcIndex % 18 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t(); else 
ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_t(); } - else if (funcIndex < 1800) Caller<900, 1800, USING_LL128>::call(funcIndex); - else if (funcIndex < 2700) { - if (funcIndex % 15 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(); - else if (funcIndex % 15 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(); - else if (funcIndex % 15 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(); - else if (funcIndex % 15 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(); - else if (funcIndex % 15 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t(); - else if (funcIndex % 15 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t(); - else if (funcIndex % 15 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t(); - else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t(); - else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t(); + else if (funcIndex < 2160) Caller<1080, 2160, USING_LL128>::call(funcIndex); + else if (funcIndex < 3240) { + if (funcIndex % 18 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(); + else if (funcIndex % 18 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(); + else if 
(funcIndex % 18 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(); + else if (funcIndex % 18 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(); + else if (funcIndex % 18 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t(); + else if (funcIndex % 18 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t(); + else if (funcIndex % 18 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t(); + else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t(); + else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t(); else ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_t(); } - else if (funcIndex < 4500) Caller<2700, 4500, USING_LL128>::call(funcIndex); + else if (funcIndex < 5400) Caller<3240, 5400, USING_LL128>::call(funcIndex); else { - switch (funcIndex - 4500) { + switch (funcIndex - 5400) { case 0: ncclFunction_OneRankReduce_PreMulSum_int8_t(); break; @@ -353,7 +354,6 @@ struct ncclShmemGroup { ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY]; void* srcs[NCCL_MAX_NVLS_ARITY+1]; void* dsts[NCCL_MAX_NVLS_ARITY+1]; - int nvlsRecv; uint64_t barrier; uint64_t barrier_next[NCCL_MAX_GROUPS]; }; @@ -621,7 +621,8 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev IMPL_COLL4(func, RING, devredop, type) \ IMPL_COLL4(func, COLLNET_DIRECT, devredop, type) \ IMPL_COLL4(func, COLLNET_CHAIN, devredop, type) \ - IMPL_COLL4(func, NVLS, devredop, type) + IMPL_COLL4(func, NVLS, devredop, type) \ + IMPL_COLL4(func, NVLS_TREE, devredop, type) #define 
IMPL_COLL2(func, devredop) \ IMPL_COLL3(func, devredop, int8_t) \ diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index cd1aa5b6c8..f204aee88e 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -28,7 +28,8 @@ inline __device__ int loadInt(int* ptr) { } template __device__ __forceinline__ void reduceCopyPacks( int nThreads, int &thread, @@ -37,6 +38,7 @@ __device__ __forceinline__ void reduceCopyPacks( IntBytes &nBytesBehind, IntBytes &nBytesAhead ) { static_assert(std::is_signed::value, "IntBytes must be a signed integral type."); + //if (BytePerPack == 0) __trap(); // A hunk is the amount of contiguous data a warp consumes per loop iteration // assuming all threads partake. @@ -49,15 +51,15 @@ __device__ __forceinline__ void reduceCopyPacks( IntBytes threadBytesBehind = nBytesBehind + (warp*BytePerHunk + lane*BytePerPack); IntBytes threadBytesAhead = nBytesAhead - (warp*BytePerHunk + lane*BytePerPack); // Number of hunks to be consumed over all warps. - IntBytes nHunksAhead = nBytesAhead/BytePerHunk; + IntBytes nHunksAhead = nBytesAhead/(BytePerHunk + !BytePerHunk); // Advance collective position. nBytesBehind += nHunksAhead*BytePerHunk; nBytesAhead -= nHunksAhead*BytePerHunk; if (Unroll==1 && BytePerPack <= nBytesAhead) { // Only Unroll=1 can do partial hunks (where not all threads partake). nHunksAhead += 1; - nBytesBehind += nBytesAhead - (nBytesAhead%BytePerPack); - nBytesAhead = nBytesAhead%BytePerPack; + nBytesBehind += nBytesAhead - (nBytesAhead%(BytePerPack + !BytePerPack)); + nBytesAhead = nBytesAhead%(BytePerPack + !BytePerPack); } nHunksAhead -= warp; @@ -79,8 +81,13 @@ __device__ __forceinline__ void reduceCopyPacks( { RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { - // Use volatile loads in case credits are polled for with volatile (instead of acquire). 
- acc[u] = ld_volatile_global(minSrcs[0]); + if (0 < MultimemSrcs) { + // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. + acc[u] = applyLoadMultimem(preFn, minSrcs[0]); + } else { + // Use volatile loads in case credits are polled for with volatile (instead of acquire). + acc[u] = ld_volatile_global(minSrcs[0]); + } minSrcs[0] += WARP_SIZE*BytePerPack; if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]); } @@ -92,8 +99,13 @@ __device__ __forceinline__ void reduceCopyPacks( RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { - // Use volatile loads in case credits are polled for with volatile (instead of acquire). - tmp[u] = ld_volatile_global(minSrcs[s]); + if (s < MultimemSrcs) { + // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. + acc[u] = applyLoadMultimem(preFn, minSrcs[s]); + } else { + // Use volatile loads in case credits are polled for with volatile (instead of acquire). 
+ tmp[u] = ld_volatile_global(minSrcs[s]); + } minSrcs[s] += WARP_SIZE*BytePerPack; } #pragma unroll Unroll @@ -130,7 +142,11 @@ __device__ __forceinline__ void reduceCopyPacks( for (int d=0; d < MinDsts; d++) { #pragma unroll Unroll for (int u=0; u < Unroll; u++) { - st_global(minDsts[d], acc[u]); + if (d < MultimemDsts) { + multimem_st_global(minDsts[d], acc[u]); + } else { + st_global(minDsts[d], acc[u]); + } minDsts[d] += WARP_SIZE*BytePerPack; } } @@ -167,215 +183,61 @@ __device__ __forceinline__ void reduceCopyPacks( } template -__device__ __forceinline__ void ReduceOrCopyMulti( +__device__ __forceinline__ void reduceCopy( int thread, int nThreads, uint64_t redArg, uint64_t *preOpArgs, bool postOp, int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs, IntBytes nElts ) { + static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values."); //int nWarps = nThreads/WARP_SIZE; //int warp = thread/WARP_SIZE; int lane = thread%WARP_SIZE; - - // Check that all is 16B aligned. If not don't use 16B load/stores. - int aligned = 1; - if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane])%4; - if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane])%4; - aligned = !(__any(!aligned)); + // If a multimem src is present then our biggest pack size is limited to what + // is supported for this redfn/type. + constexpr int BigPackSize = (MultimemSrcs == 0) ? 
16 : LoadMultimem_BigPackSize::BigPackSize; IntBytes nBytesBehind = 0; IntBytes nBytesAhead = nElts*sizeof(T); - if (aligned) { - reduceCopyPacks - (nThreads, /*&*/thread, redArg, preOpArgs, postOp, - nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); - if (nBytesAhead == 0) return; - reduceCopyPacks - (nThreads, /*&*/thread, redArg, preOpArgs, postOp, - nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); - if (nBytesAhead == 0) return; + #if __cpp_if_constexpr + if constexpr (BigPackSize > sizeof(T)) { + #else + if (BigPackSize > sizeof(T)) { + #endif + // Check that all pointers are BigPackSize aligned. + bool aligned = true; + if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize); + if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize); + aligned = !(__any(!aligned)); + if (aligned) { + reduceCopyPacks + (nThreads, /*&*/thread, redArg, preOpArgs, postOp, + nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); + if (nBytesAhead == 0) return; + + reduceCopyPacks + (nThreads, /*&*/thread, redArg, preOpArgs, postOp, + nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); + if (nBytesAhead == 0) return; + } } reduceCopyPacks + MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs> (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); if (nBytesAhead == 0) return; reduceCopyPacks + MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs> (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); } -// Copies from srcAddr to dstAddr using multimem load/store. The amount copied -// will be at most Unroll*BytePerPack*WARP_SIZE. If Partial=1, then the amount -// will be the min() of that and nBytesAhead. 
If srcAddr is not BytePerPack -// aligned then the amount copied will be less by (srcAddr%BytePerPack) since -// we begin loads at the first pack containing the first element. -template -__device__ __forceinline__ void copyMultimemMultimem_WarpUnrolled( - int lane, RedFn redFn, bool postOp, uintptr_t srcAddr, uintptr_t dstAddr, - IntBytes nBytesAhead, uint32_t scratchAddr - ) { -#if 0 - int srcMisalign = SrcAligned ? 0 : srcAddr%BytePerPack; - srcAddr -= srcMisalign; - - BytePack reg[Unroll]; - int offset = lane*BytePerPack; - #pragma unroll Unroll - for (int u=0; u < Unroll; u++) { - if (!Partial || (offset < srcMisalign + nBytesAhead)) { - reg[u] = applyLoadMultimem(redFn, srcAddr+offset); - if (postOp) reg[u] = applyPostOp(redFn, reg[u]); - } - offset += WARP_SIZE*BytePerPack; - } - - if (SrcAligned && DstAligned) { - offset = lane*BytePerPack; - #pragma unroll Unroll - for (int u=0; u < Unroll; u++) { - if (!Partial || offset < nBytesAhead) { - multimem_st_global(dstAddr+offset, reg[u]); - } - offset += WARP_SIZE*BytePerPack; - } - } else { - __syncwarp(); - offset = lane*BytePerPack; - #pragma unroll Unroll - for (int u=0; u < Unroll; u++) { - if (!Partial || (offset < srcMisalign + nBytesAhead)) { - st_shared(scratchAddr+offset, reg[u]); - } - offset += WARP_SIZE*BytePerPack; - } - __syncwarp(); - if (!SrcAligned) { - // Ignore the beginning of the first pack corresponding to bytes overread - // due to misalignment. - nBytesAhead = min(nBytesAhead, Unroll*WARP_SIZE*BytePerPack - srcMisalign); - } - copyGlobalShared_WarpUnrolled - - (lane, dstAddr, scratchAddr+srcMisalign, nBytesAhead); - } -#endif -} - -// copyMultimemMultimem_IfEnabled has two overloads: the enabled case whose first arg -// has type `std::true_type` and the disabled case with first arg `std::false_type`. -// This is to guard the template instantiations of Apply_LoadMultimem on types/ops where -// they aren't supported. A nicer approach is to use C++17's "if constexpr". 
-template -__device__ __forceinline__ void copyMultimemMultimem_IfEnabled( - std::false_type enabled/*=false*/, - int thread, int nThreads, uint64_t redArg, bool postOp, - void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr - ) { - // nop -} - -template -__device__ __forceinline__ void copyMultimemMultimem_IfEnabled( - std::true_type enabled/*=true*/, - int thread, int nThreads, uint64_t redArg, bool postOp, - void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr - ) { - static_assert(std::is_signed::value, "IntBytes must be a signed integral type."); - - constexpr int BytePerPack = Apply_LoadMultimem::PackSize; - using T = typename RedFn::EltType; - constexpr int Unroll = ncclNvlsUnroll(BytePerPack); - constexpr int BytePerHunk = Unroll*WARP_SIZE*BytePerPack; - int nWarps = nThreads/WARP_SIZE; - int warp = thread/WARP_SIZE; - int lane = thread%WARP_SIZE; - RedFn redFn(redArg); - - uintptr_t srcAddr = cvta_to_global(srcPtr); - uintptr_t dstAddr = cvta_to_global(dstPtr); - IntBytes warpBytesAhead = nElts*sizeof(T); - bool partialHunkIsFront; - - // First handle misalignment of srcAddr. - if ((BytePerPack != sizeof(T)) && (srcAddr%BytePerPack != 0)) { - // If srcAddr isn't pack aligned then the first hunk processed will be short - // the same number of bytes as srcAddr's misalignment. - if (warp == 0) { - partialHunkIsFront = true; - goto PartialHunk; // "call" PartialHunk() - PartialHunkFrontReturn: - warp = nWarps; - } - warp -= 1; // Rotate warp numbers for load balancing - int advanced = BytePerHunk-(srcAddr%BytePerPack); // since copyMultimemMultimem_WarpUnrolled shorts by the misalignment - srcAddr += advanced; // srcAddr is now pack aligned - dstAddr += advanced; - warpBytesAhead -= advanced; - } - - warpBytesAhead -= warp*BytePerHunk; - srcAddr += warp*BytePerHunk; - dstAddr += warp*BytePerHunk; - // Now that srcAddr is pack aligned detect if dstAddr is pack aligned. 
- if ((BytePerPack == sizeof(T)) || (dstAddr%BytePerPack == 0)) { - while (BytePerHunk <= warpBytesAhead) { - copyMultimemMultimem_WarpUnrolled - - (lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr); - srcAddr += nWarps*BytePerHunk; - dstAddr += nWarps*BytePerHunk; - warpBytesAhead -= nWarps*BytePerHunk; - } - } else { - while (BytePerHunk <= warpBytesAhead) { - copyMultimemMultimem_WarpUnrolled - - (lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr); - srcAddr += nWarps*BytePerHunk; - dstAddr += nWarps*BytePerHunk; - warpBytesAhead -= nWarps*BytePerHunk; - } - } - - if (0 < warpBytesAhead) { - partialHunkIsFront = false; - goto PartialHunk; // "call" PartialHunk() - PartialHunkBackReturn:; - } - return; - -PartialHunk: - // We have to handle a partial hunk possibly at the front and back of the - // buffer. We generate the code once here since its a lot of instructions, - // and then simulate function calls with gotos. - copyMultimemMultimem_WarpUnrolled - - (lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr); - if (partialHunkIsFront) goto PartialHunkFrontReturn; - goto PartialHunkBackReturn; -} - -template -__device__ __forceinline__ void copyMultimemMultimem( - int thread, int nThreads, uint64_t redArg, bool postOp, - void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr - ) { - constexpr bool Enabled = Apply_LoadMultimem::PackSize != 0; - copyMultimemMultimem_IfEnabled( - /*enabled=*/std::integral_constant(), - thread, nThreads, redArg, postOp, srcPtr, dstPtr, nElts, warpScratchAddr); -} #endif // COMMON_KERNEL_H_ diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 018e511c0b..85695f059e 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -26,7 +26,8 @@ __shared__ ncclShmemData ncclShmem; NCCL_FUNC5(func, RING, devredop, type, nullify), \ NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \ 
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \ - NCCL_FUNC5(func, NVLS, devredop, type, nullify) + NCCL_FUNC5(func, NVLS, devredop, type, nullify), \ + NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify) #if defined(__CUDA_BF16_TYPES_EXIST__) // Must be consistent with ncclDataType_t diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu index 9ebe8eea84..0c40f9afce 100644 --- a/src/collectives/device/onerank_reduce.cu +++ b/src/collectives/device/onerank_reduce.cu @@ -42,7 +42,7 @@ namespace { dst += i0; void *vsrc = (void*)src; void *vdst = (void*)dst; - ReduceOrCopyMulti + reduceCopy (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0); } } diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h index 8ee13fbb6f..5b694545c5 100644 --- a/src/collectives/device/op128.h +++ b/src/collectives/device/op128.h @@ -7,6 +7,8 @@ #ifndef OP128_H_ #define OP128_H_ +#include + inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { v0 = __builtin_nontemporal_load(ptr); v1 = __builtin_nontemporal_load(ptr+1); @@ -88,6 +90,8 @@ __device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) { template union BytePack; template<> +union BytePack<0> {}; +template<> union BytePack<1> { uint8_t u8, native; }; @@ -130,14 +134,26 @@ union alignas(16) BytePack<16> { }; template -__device__ __forceinline__ BytePack toPack(T value) { - union { BytePack p; T v; }; +struct BytePackOf { + static constexpr int Size = sizeof(T); + using Pack = BytePack; +}; +template<> +struct BytePackOf> { + static constexpr int Size = 0; + using Pack = BytePack<0>; +}; + +template +__device__ __forceinline__ typename BytePackOf::Pack toPack(T value) { + union { typename BytePackOf::Pack p; T v; }; v = value; return p; } + template -__device__ __forceinline__ T fromPack(BytePack pack) { - union { BytePack p; T v; }; +__device__ __forceinline__ T fromPack(typename BytePackOf::Pack pack) 
{ + union { typename BytePackOf::Pack p; T v; }; p = pack; return v; } @@ -152,6 +168,13 @@ template __device__ BytePack ld_volatile_global(uintptr_t addr); template __device__ void st_global(uintptr_t addr, BytePack value); //template __device__ void st_shared(uint32_t addr, BytePack value); +template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; } +template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; } +//template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; } +//template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; } +template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {} +//template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {} + // Used to define implementations for above prototypes. #define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \ template<> \ @@ -255,6 +278,18 @@ __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack= 900 && CUDART_VERSION >= 12010 template<> +__device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) { + // nop +} +template<> +__device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) { + asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory"); +} +template<> +__device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) { + asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory"); +} +template<> __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) { asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory"); } diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index 024df18c64..170a9cad43 100644 --- 
a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -37,13 +37,14 @@ * to how that protocol operates with a consistent interface so that our * algorithm code can operate protocol parametrically. */ -template +template struct ProtoSimple { static constexpr int Id = NCCL_PROTO_SIMPLE; static constexpr int SlicePerChunk = SlicePerChunk_1; static constexpr int StepPerSlice = StepPerSlice_1; static constexpr int Unroll = Unroll_1; - static constexpr bool NVLS = NVLS_1; + static constexpr int MultimemSrcs = MultimemSrcs_1; + static constexpr int MultimemDsts = MultimemDsts_1; // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { @@ -55,9 +56,6 @@ struct ProtoSimple { } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 1; - __device__ static int calcGroupWidth(bool send, int nthreads) { - return 1; - } }; struct ProtoLL { @@ -73,9 +71,6 @@ struct ProtoLL { } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 1; - __device__ static int calcGroupWidth(bool send, int nthreads) { - return 1; - } }; struct ProtoLL128 { @@ -91,9 +86,6 @@ struct ProtoLL128 { } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 1; - __device__ static int calcGroupWidth(bool send, int nthreads) { - return 1; - } }; /* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template @@ -133,22 +125,22 @@ class Primitives; // Used by LL & LL128 to implement direct members in the naive way. 
template struct PrimitivesWithoutDirect { - __device__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) { + __device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) { static_cast(this)->send(inpIx, eltN); } - __device__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) { + __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } - __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { + __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->copySend(inpIx, outIx, eltN, postOp); } - __device__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) { + __device__ void directRecvCopySend(intptr_t outIx, int eltN) { static_cast(this)->recvCopySend(outIx, eltN, /*postOp=*/false); } - __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { + __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part static_cast(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp); } diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index a0060dae46..faa2b03770 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -566,24 +566,24 @@ private: public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, - void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0 + void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, + uint8_t connIndexRecv=0, uint8_t connIndexSend=0 ): redOp(redOpArg), - tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), 
group(group&(uint16_t)0xFFFF), + tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { - int connIndex = group >> 16; auto *channel = &ncclShmem.channel; - barriers = &ncclShmem.groups[this->group].barrier; - barrier_next = ncclShmem.groups[this->group].barrier_next; + barriers = &ncclShmem.groups[group].barrier; + barrier_next = ncclShmem.groups[group].barrier_next; // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1 while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend); + loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 0529464f36..469b843890 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -32,6 +32,7 @@ class Primitives: const int wid; const int stepSize; const int warp; + const int warpInBlock; // warp index in thread block const bool flagThread; const int group; Fan fan; @@ -488,23 +489,24 @@ private: public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, - void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0 + void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, + uint8_t connIndexRecv=0, uint8_t connIndexSend=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), - flagThread((tid%4)==3), 
group(group&(uint16_t)0xFFFF), + warpInBlock(threadIdx.x/WARP_SIZE), + flagThread((tid%4)==3), group(group), stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) { - int connIndex = group >> 16; auto *channel = &ncclShmem.channel; - barriers = &ncclShmem.groups[this->group].barrier; - barrier_next = ncclShmem.groups[this->group].barrier_next; + barriers = &ncclShmem.groups[group].barrier; + barrier_next = ncclShmem.groups[group].barrier_next; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { - loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv); + loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { - loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend); + loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend); nsend++; } this->fan = Fan(nrecv, nsend); diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index fb5b0e0af9..27c02bf0bd 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -13,9 +13,9 @@ #include "msccl/msccl_struct.h" template + int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts> class Primitives< - T, RedOp, Fan, Direct, ProtoSimple, P2p + T, RedOp, Fan, Direct, ProtoSimple, P2p > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -31,10 +31,9 @@ class Primitives< DirectWrite = 0x200, DirectRead = 0x400, ThreadsSynced = 0x800, - NvlsMinPolling = 0x1000, - NvlsRecv = 0x2000; + NvlsMinPolling = 0x1000; const int tid, tidInBlock; - int nthreads; + const int nthreads; int nworkers; const int stepSize; Fan fan; @@ -93,19 +92,19 @@ private: inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 - if (NVLS && (flags & 
NvlsMinPolling)) { + if (flags & NvlsMinPolling) { uint64_t ans; asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); return ans; } #endif - // volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti + // volatile is faster than acquire but not as correct. Make sure reduceCopy // loads data using volatile so it doesn't see stale data in L1. return atomicAdd((unsigned long long *)ptr, 0); } template - __device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) { + __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) { const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write @@ -132,7 +131,7 @@ private: ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T); else if (isSendNotRecv && DirectSend) { if (flags & DirectWrite) { - ptrs[index] = directBuff + remoteIx + offset; + ptrs[index] = directBuff + dstIx + offset; } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { @@ -140,7 +139,7 @@ private: } } else if (!isSendNotRecv && DirectRecv) { if (flags & DirectRead) { - ptrs[index] = directBuff + remoteIx + offset; + ptrs[index] = directBuff + srcIx + offset; } else if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer } else { @@ -173,7 +172,7 @@ private: template __device__ __forceinline__ void genericOp( - intptr_t srcIx, intptr_t dstIx, intptr_t remoteIx, int nelem, bool postOp + intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp ) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && 
DirectSend1; @@ -217,17 +216,12 @@ private: ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset; if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput))) ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset; - waitPeer(dstIx, remoteIx, offset, sliceSize); + waitPeer(srcIx, dstIx, offset, sliceSize); subBarrier(); /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size * to 0 to avoid unnecessary workload. */ int workSize = ncclShmem.aborted ? 0 : sliceSize; - if (NVLS && ncclShmem.groups[group].nvlsRecv) { - void* src = ncclShmem.groups[group].srcs[0]; - void* dst = ncclShmem.groups[group].dsts[0]; - copyMultimemMultimem(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize, - cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE))); - } else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { + if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { @@ -244,7 +238,7 @@ private: } #endif - ReduceOrCopyMulti + reduceCopy (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false, 1, ncclShmem.groups[group].srcs, fan.nsend(), ncclShmem.groups[group].dsts+1, @@ -280,7 +274,7 @@ private: } #endif - ReduceOrCopyMulti + reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp, Recv, ncclShmem.groups[group].srcs, Dst, ncclShmem.groups[group].dsts, @@ -316,7 +310,9 @@ private: constexpr int PreOpSrcs = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? 
(1+NCCL_MAX_DIRECT_ARITY) : 1; - ReduceOrCopyMulti + reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs, Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts, @@ -370,19 +366,19 @@ private: srcs[nsrcs] = dsts[0]; nsrcs++; if (MULTISRCS){ - ReduceOrCopyMulti + reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, nsrcs, (void **)srcs, 1, (void **)dsts, nelem); } else { - ReduceOrCopyMulti + reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 2, (void **)srcs, 1, (void **)dsts, nelem); } } if (COPY){ - ReduceOrCopyMulti + reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)srcs, 1, (void **)dsts, nelem); if (MULTISRCS) { for (int i = 1; i < nsrcs; i++){ - ReduceOrCopyMulti + reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)&srcs[i], 1, (void **)&dsts[i], nelem); } } @@ -425,7 +421,7 @@ private: void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset; int realPeerSize = min(realSize, totalElem-pOffset); if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { - ReduceOrCopyMulti(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize); + reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize); // Mark for threadfence at the end fenceNeeded |= true; } @@ -437,18 +433,15 @@ private: // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer waitPeer(outIx, outIx+pOffset, offset, realSize); subBarrier(); - if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { - // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv - // Do nothing - } else { - for (int j=0; j= 0 && i >= skip) pOffset += 
peerElem; - void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset; - int realPeerSize = min(realSize, totalElem-pOffset); - if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); - } + #pragma unroll 1 + for (int j=0; j= 0 && i >= skip) pOffset += peerElem; + void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset; + int realPeerSize = min(realSize, totalElem-pOffset); + if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0; + if (realPeerSize > 0) reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); } } } @@ -469,14 +462,7 @@ private: } if (flags & RoleWaitRecv) { ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() - if ((index == 0) && (flags & RoleWaitRecv)) { - if (conn->flags & NCCL_NVLS_MIN_POLL) { - flags |= NvlsMinPolling; - ncclShmem.groups[group].nvlsRecv = 1; - } else { - ncclShmem.groups[group].nvlsRecv = 0; - } - } + flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; connStepPtr = conn->tail; connStepCache = loadStepValue(connStepPtr); flags |= (conn->offsFifo != nullptr) ? 
OffsFifoEnabled : 0; @@ -554,18 +540,16 @@ private: public: __forceinline__ __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, - void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr + void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, + uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr ): - tid(tid), tidInBlock(threadIdx.x), + tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nthreads = nthreads; + barriers = &ncclShmem.groups[group].barrier; + barrier_next = ncclShmem.groups[group].barrier_next; this->nworkers = nthreads; - this->group = group & (uint16_t)0xFFFF; - int connIndex = group >> 16; - barriers = &ncclShmem.groups[this->group].barrier; - barrier_next = ncclShmem.groups[this->group].barrier_next; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; @@ -595,8 +579,8 @@ private: if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; - loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e); - loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e); + loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e); + loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e); setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } @@ -707,62 +691,62 @@ private: } __device__ __forceinline__ void send(intptr_t inpIx, int eltN) { - genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, -1, eltN, false); + genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) { - genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, -1, eltN, false); + 
genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false); } - __device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) { - genericOp<0, 1, 0, 1, Input, -1>(inpIx, -1, remoteOutIx, eltN, false); + __device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) { + genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false); } - __device__ __forceinline__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) { - genericOp<0, 1, 0, 1, Output, -1>(outIx, -1, remoteOutIx, eltN, false); + __device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) { + genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false); } __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { - genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, postOp); + genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) { - genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, /*postOp=*/false); + genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false); } __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp); + genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } - __device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { - genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp); + __device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvSend(int eltN, bool postOp=false) { - genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp); + genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp); } __device__ 
__forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { - genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp); + genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) { - genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, false); + __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) { + genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false); } - __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { - genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, postOp); + __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, -1, eltN, postOp); + genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { - genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, -1, eltN, postOp); + genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } - __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, intptr_t remoteInpIx, int eltN, bool postOp=false) { - genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, remoteInpIx, eltN, postOp); + __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp); + genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } - __device__ 
__forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { + __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part - genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp); + genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 790eca0efa..491258ea62 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -31,7 +31,7 @@ namespace { const int root = args->root; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16); + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex); auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { int realChunkSize; diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h index ab490450ef..b5336570f4 100644 --- a/src/collectives/device/reduce_kernel.h +++ b/src/collectives/device/reduce_kernel.h @@ -56,9 +56,14 @@ struct Apply_PostOp/*{ static BytePack postOp(Fn fn, BytePack a); }*/; template +struct LoadMultimem_BigPackSize/*{ + // If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem, + // otherwise there are no valid pack sizes for LoadMultimem. 
+ static constexpr int BigPackSize = 0; +}*/; +template struct Apply_LoadMultimem/*{ - static constexpr int PackSize; // 0 if not implemented - static BytePack load(Fn fn, uintptr_t addr); + static BytePack load(Fn fn, uintptr_t addr); }*/; //////////////////////////////////////////////////////////////////////////////// @@ -70,7 +75,7 @@ struct Apply_LoadMultimem/*{ template __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { return fromPack( - Apply_Reduce + Apply_Reduce::Size/sizeof(typename Fn::EltType)> ::reduce(fn, toPack(a), toPack(b)) ); } @@ -78,7 +83,7 @@ __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { template __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { return fromPack( - Apply_PreOp + Apply_PreOp::Size/sizeof(typename Fn::EltType)> ::preOp(fn, toPack(a)) ); } @@ -86,19 +91,27 @@ __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { template __device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) { return fromPack( - Apply_PostOp + Apply_PostOp::Size/sizeof(typename Fn::EltType)> ::postOp(fn, toPack(a)) ); } -template -__device__ __forceinline__ BytePack::PackSize> applyLoadMultimem(Fn fn, uintptr_t addr) { - return Apply_LoadMultimem::load(fn, addr); +template +__device__ __forceinline__ BytePack applyLoadMultimem(Fn fn, uintptr_t addr) { + return Apply_LoadMultimem::load(fn, addr); } //////////////////////////////////////////////////////////////////////////////// // Apply_Reduce +// Nonsensical base case +template +struct Apply_Reduce { + __device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { + return {}; + } +}; + // General recursive definition (EltPerPack > 1). This is how we iterate over // all elements in a pack of any size, by breaking it into halves. Eventually // we'll hit a base case (a more specific template specialization which takes @@ -283,6 +296,14 @@ struct Apply_PreOp { return a; } }; +// Base case definition (EltPerPack == 0), is nonsense! 
+template +struct Apply_PreOp { + static constexpr bool IsIdentity = true; + __device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) { + return {}; + } +}; //////////////////////////////////////////////////////////////////////////////// // Apply_PostOp @@ -316,6 +337,14 @@ struct Apply_PostOp { return a; } }; +// Base case definition (EltPerPack == 0), is nonsense! +template +struct Apply_PostOp { + static constexpr bool IsIdentity = true; + __device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) { + return {}; + } +}; //////////////////////////////////////////////////////////////////////////////// @@ -506,11 +535,6 @@ struct Apply_PostOp, /*EltPerPack=*/1> { //////////////////////////////////////////////////////////////////////////////// // Apply_LoadMultimem -template -struct Apply_LoadMultimem { - static constexpr int PackSize = 0; // Indicates not implemented -}; - #define SIZEOF_BytePack_field_u16 2 #define PTX_REG_BytePack_field_u16 "h" @@ -522,11 +546,11 @@ struct Apply_LoadMultimem { #define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \ template<> \ - struct Apply_LoadMultimem> { \ - static constexpr int PackSize = 1*(SIZEOF_BytePack_field_##pack_field); \ + struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ + static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ __device__ static BytePack load(Fn fn, uintptr_t addr) { \ BytePack ans; \ - asm("multimem.ld_reduce.global." #op "." #ptx_ty " %0, [%1];" \ + asm("multimem.ld_reduce.relaxed.sys.global." #op "." 
#ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ : "l"(addr)); \ return ans; \ @@ -534,11 +558,11 @@ struct Apply_LoadMultimem { }; #define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \ template<> \ - struct Apply_LoadMultimem> { \ + struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ __device__ static BytePack load(Fn fn, uintptr_t addr) { \ BytePack ans; \ - asm("multimem.ld_reduce.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ @@ -547,8 +571,45 @@ struct Apply_LoadMultimem { return ans; \ } \ }; +#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \ + DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \ + template<> \ + struct Apply_LoadMultimem, sizeof(T)> { \ + __device__ static BytePack load(Fn fn, uintptr_t addr) { \ + BytePack<2*sizeof(T)> tmp; \ + asm("multimem.ld_reduce.relaxed.sys.global." #op "." 
#ptx_ty " %0, [%1];" \ + : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ + : "l"(addr & -uintptr_t(sizeof(T)))); \ + return tmp.half[(addr/sizeof(T))%2]; \ + } \ + }; + +template +struct Apply_LoadMultimem { + __device__ static BytePack load(Fn fn, uintptr_t addr) { + //__trap(); + return {}; + } +}; #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 + template + struct LoadMultimem_BigPackSize { + using T = typename Fn::EltType; + static constexpr bool IsSum = std::is_same>::value || + std::is_same>::value || + std::is_same>::value; + static constexpr bool IsMinOrMax = std::is_same>::value || + std::is_same>::value; + static constexpr bool IsFloat = IsFloatingPoint::value; + static constexpr int BigPackSize = + IsFloat && IsSum && sizeof(T) < 8 ? 16 : + IsFloat && IsSum ? 8 : + IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 : + !IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) : + /*multimem.ld_reduce not supported:*/ 0; + }; + DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32) DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32) DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32) @@ -565,23 +626,30 @@ struct Apply_LoadMultimem { DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64) DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64) + DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32) DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32) DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64) - DEFINE_Apply_LoadMultimem_v4(FuncSum, half, add, f16x2, u32) - DEFINE_Apply_LoadMultimem_v4(FuncMin, half, min, f16x2, u32) - DEFINE_Apply_LoadMultimem_v4(FuncMax, half, max, f16x2, u32) + DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32) + DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32) + DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32) #if defined(__CUDA_BF16_TYPES_EXIST__) - DEFINE_Apply_LoadMultimem_v4(FuncSum, 
__nv_bfloat16, add, bf16x2, u32) - DEFINE_Apply_LoadMultimem_v4(FuncMin, __nv_bfloat16, min, bf16x2, u32) - DEFINE_Apply_LoadMultimem_v4(FuncMax, __nv_bfloat16, max, bf16x2, u32) + DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32) + DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32) + DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32) #endif +#else + template + struct LoadMultimem_BigPackSize { + static constexpr int BigPackSize = 0; + }; #endif #undef DEFINE_Apply_LoadMultimem #undef DEFINE_Apply_LoadMultimem_v4 +#undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf #undef SIZEOF_BytePack_field_u64 #undef PTX_REG_BytePack_field_u64 #undef SIZEOF_BytePack_field_u32 diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index cf4278485c..6a7caeba37 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -30,7 +30,7 @@ namespace { const ssize_t size = args->count; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16); + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -113,19 +113,19 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args); + prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, + args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0); } } else if (tid < tidEndReduce) { - int group = 
(3*Proto::MaxGroupWidth) | (1<<16); - // Reduce through MC + // Reduce through NVLS Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args); + prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, + args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index abb71b7a6d..030b1af7c4 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -15,7 +15,7 @@ template struct RunWork { template - __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + __device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); @@ -58,9 +58,8 @@ struct RunWork { } #endif - ReduceOrCopyMulti + reduceCopy (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count); - #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), @@ -80,7 +79,7 @@ struct RunWork { if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; int const peer = args->peer; Primitives, 0, Proto, 1> prims - (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group); + (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex); #if defined(ENABLE_NPKIT) if (isNpKitThread) { @@ -114,7 +113,7 @@ struct RunWork { } template - __device__ 
void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + __device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { #if defined(ENABLE_NPKIT) bool isNpKitThread = (tid == 0); int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1; @@ -142,7 +141,7 @@ struct RunWork { if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize int const peer = args->peer; Primitives, 0, Proto, 1> prims - (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group); + (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex); #if defined(ENABLE_NPKIT) if (isNpKitThread) { @@ -189,11 +188,10 @@ struct RunWork { // warpStarts were rounded thanks to int division, but for group number we need to round the other way around // So we mirror wid then mirror again the group. #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) - int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; + uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; args += group; tid -= args->warpStart * WARP_SIZE; int nthreads = args->nWarps * WARP_SIZE; - group |= (args->connIndex<<16); // Used to select connIndex 1 if (args->p2pType == ncclWorkP2pTypeUnused) return; if (tid >= nthreads || args->peer == -1) return; diff --git a/src/debug.cc b/src/debug.cc index 560c1d26a0..b88fa5982a 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -74,6 +74,8 @@ void ncclDebugInit() { mask = NCCL_ALLOC; } else if (strcasecmp(subsys, "CALL") == 0) { mask = NCCL_CALL; + } else if (strcasecmp(subsys, "PROXY") == 0) { + mask = NCCL_PROXY; } else if (strcasecmp(subsys, "NVLS") == 0) { mask = NCCL_NVLS; } else if (strcasecmp(subsys, "ALL") == 0) { diff --git a/src/enqueue.cc b/src/enqueue.cc index 61bc8169f4..6f3318e7e6 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -167,12 +167,13 @@ static void finishWork(struct ncclWork* 
work, int WarpSize) { static void appendWorkElemP2p( struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, - struct ncclWorkElemP2p const *elem + struct ncclWorkElemP2p const *elem, bool fuseOk ) { constexpr int funcIndex = FUNC_INDEX_P2P; struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); if (q && funcIndex == q->work.header.funcIndex) { + if (!fuseOk) goto NewWork; if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) { for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) { // Can't have multiple elements of the same ncclWork communicate with the @@ -301,7 +302,7 @@ NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384); // ensure *nWorkBudget >= 1 upon entry. static ncclResult_t addP2pToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, - bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, uint32_t connIndex + bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, uint32_t connIndex, bool fuseOk ) { struct ncclInfo info = { isSendNotRecv ? ncclFuncSend : ncclFuncRecv, @@ -316,7 +317,7 @@ static ncclResult_t addP2pToPlan( // 1 is connIndex struct ncclConnInfo* conn = isSendNotRecv ? - &comm->channels[channelId].peers[peer].send[1].conn : &comm->channels[channelId].peers[peer].recv[1].conn; + &comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn; info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? 
NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; struct ncclProxyOp proxyOp = {}; @@ -337,7 +338,7 @@ static ncclResult_t addP2pToPlan( elem.connIndex = connIndex; *nWorkBudget += plan->channels[channelId].nWork; - appendWorkElemP2p(comm, plan, channelId, &elem); + appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk); *nWorkBudget -= plan->channels[channelId].nWork; // Calculate the opCount after appendWorkElemP2p since it will always return @@ -508,7 +509,7 @@ static ncclResult_t scheduleCollTasksToPlan( info.sliceSteps = head->sliceSteps; NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks)); if (nAggOps > 1) { - int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels; + int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels; info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]); info.nChannels = std::max(1, std::min(info.nChannels, maxChannels)); info.algorithm = aggInfo.algorithm; @@ -533,7 +534,7 @@ static ncclResult_t scheduleCollTasksToPlan( NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv)); } - int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels; + int maxChannels = info.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels; NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp, maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv)); tasks->nTasksColl -= 1; @@ -584,17 +585,22 @@ static ncclResult_t scheduleP2pTasksToPlan( // Try to use all channels int nChannelsMax = comm->p2pnChannelsPerPeer; int nChannelsMin = nChannelsMax; - // Try to use all channels, but one channel per operation. 
- while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; - // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. - while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; + if (comm->nNodes == 1) { + // Try to use all channels, but one channel per operation. + while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; + // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. + while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; + } + bool fuseOk; + // We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries. while (tasks->nTasksP2p != 0) { - for (int i=0; i < nRanks; i++) { + for (int i=0; i < tasks->p2pOrderSteps; i++) { int sendPeer = sendOrder[i]; int recvPeer = recvOrder[i]; - struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendPeer].sendQueue); - struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvPeer].recvQueue); + if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false; + struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL; + struct ncclTaskP2p* recv = recvPeer != -1 ? 
ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL; if (sendPeer == comm->rank) { if (recvPeer != comm->rank) { WARN("Sendrecv plan not aligned for self"); @@ -639,7 +645,8 @@ static ncclResult_t scheduleP2pTasksToPlan( if (recvChunkBytes != 0) { if (recvChunkBytes == -1) recvChunkBytes = 0; if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget - NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, recvIdx)); + NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, recvIdx, fuseOk)); + fuseOk = true; recvPtr += recvChunkBytes; recvBytes -= recvChunkBytes; recv->chunk += 1; @@ -652,7 +659,8 @@ static ncclResult_t scheduleP2pTasksToPlan( if (sendChunkBytes != 0) { if (sendChunkBytes == -1) sendChunkBytes = 0; if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget - NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, sendIdx)); + NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, sendIdx, fuseOk)); + fuseOk = true; sendPtr += sendChunkBytes; sendBytes -= sendChunkBytes; send->chunk += 1; @@ -785,12 +793,12 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla } static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) { - uint64_t collOpCount = comm->collOpCount; + uint64_t collOpCount = comm->sharedRes->collOpCount; // Advance comm's collOpCount by number of colls in this plan. 
- comm->collOpCount = collOpCount + plan->collOpCount; + comm->sharedRes->collOpCount += plan->collOpCount; for (int c=0; c < plan->channelUbound; c++) { struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); - uint64_t p2pOpCount = comm->channels[c].p2pOpCount; + uint64_t p2pOpCount = comm->sharedRes->p2pOpCount[c]; uint64_t nextP2pOpCount = p2pOpCount; while (q != nullptr) { struct ncclProxyOp* qNext = q->enqNext; @@ -813,7 +821,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* q = qNext; } // Advance channel's p2pOpCount by number of p2p's in this plan channel. - comm->channels[c].p2pOpCount = nextP2pOpCount; + comm->sharedRes->p2pOpCount[c] = nextP2pOpCount; } return ncclSuccess; } @@ -932,15 +940,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires // at least one of the two streams to be strong-stream. cudaStream_t launchStream = tasks->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->deviceStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure); if (tasks->numStreams != 1) { // Create dependency for device stream on user streams. First from extra user // streams to deviceStream. Then deviceStream to first user stream. 
for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, l->stream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); } else if (tasks->streams->stream != comm->lastStream && comm->lastStream != nullptr) { // Stream changed from last call, create dependency against last NCCL kernel launch CUDACHECK(hipStreamWaitEvent(tasks->streams->stream, comm->doneEvent, 0)); @@ -954,15 +962,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->hostStream, hostStreamPlanCallback, plan), result, failure); + NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. 
- if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->hostStream), result, failure); + if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); } } @@ -1011,7 +1019,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); if (driverVersion >= 11080) { int compCap = comm->compCap; - unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0; + unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0; cudaLaunchConfig_t launchConfig = {0}; cudaLaunchAttribute launchAttrs[3]; @@ -1083,7 +1091,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { // Create dependency for deviceStream on launchStream. We know that deviceStream // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), // so we can say that launchStream subsumes it. - if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); + if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); resume1: // Create dependency for other user streams (skip launch stream) on deviceStream. // Again, the user streams haven't been touched since deviceStream waited on them @@ -1091,13 +1099,13 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { struct ncclCudaStreamList* sl = tasks->streams->next; tasks->streams = nullptr; // Reset comm->tasks.streams to empty. 
while (sl != nullptr && tasks->numStreams != 1) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream, /*b_subsumes_a=*/true), result, resume2); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); resume2: sl = sl->next; } tasks->numStreams = 0; // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume3); + NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); resume3:; } return result; @@ -1108,13 +1116,9 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { /*****************************************************************************/ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) { - if (info->comm->collNetSupport > 0) { - // Translate ncclAvg and PreMulSum - ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; - NCCLCHECK(collNetReduceSupport(info->comm, info->datatype, netOp, collNetTypeSupport)); - } else { - *collNetTypeSupport = 0; - } + // Translate ncclAvg and PreMulSum + ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? 
ncclSum : info->op; + *collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype]; return ncclSuccess; } @@ -1134,6 +1138,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i for (int a=0; adatatype, info->opFull.op)) continue; + if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue; + if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue; for (int p=0; palgorithm == NCCL_ALGO_NVLS) { + } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { // NVLS should not need more than 16 channels to get peak BW. nc = comm->nvlsChannels; } else { @@ -1185,12 +1191,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else if (info->protocol == NCCL_PROTO_SIMPLE) { - nt += WARP_SIZE; // Extra warp for sync + if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync // More threads or sync warps needed due to split thread model - if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; - if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE; - if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE; - if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS; + if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; } nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; #endif @@ -1234,11 +1237,15 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) { info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; case ncclFuncReduceScatter: case ncclFuncAllGather: + info->pattern = + info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : + ncclPatternRing; break; case ncclFuncAllToAllPivot: info->pattern = ncclPatternRing; break; case ncclFuncAllReduce: info->pattern = info->algorithm == NCCL_ALGO_NVLS ? 
ncclPatternNvls : + info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree : info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : @@ -1258,14 +1265,17 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { case ncclPatternPipelineFrom: case ncclPatternPipelineTo: case ncclPatternCollnetChain: + info->nstepsPerLoop = info->nchunksPerLoop = 1; break; case ncclPatternNvls: - info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; + info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break; case ncclPatternCollnetDirect: info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break; case ncclPatternRing: info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; case ncclPatternRingTwice: info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; + case ncclPatternNvlsTree: + info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break; default: WARN("Unknown pattern %d", info->pattern); return ncclInternalError; @@ -1348,13 +1358,22 @@ comp_next: while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_NVLS) { - if (chunkSize > 131072) chunkSize = 131072; + int maxChunkSize = 131072; + if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; // Use uint64_t so that concurrentOps*chunkSize*X does not overflow uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads; - if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; + if ((info->nBytes < (64 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) 
chunkSize = 65536; if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { + // Use uint64_t so that concurrentOps*chunkSize*X does not overflow + uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads; + if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144; + if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072; + if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; + if ((info->nBytes < (1 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; + work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->protocol == NCCL_PROTO_LL) { const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; @@ -1383,8 +1402,7 @@ comp_next: proxyOp->chunkSize = chunkSize; proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; - proxyOp->redOp = (info->algorithm != NCCL_ALGO_COLLNET_DIRECT && info->algorithm != NCCL_ALGO_COLLNET_CHAIN) ? ncclNumOps : // Only set redOp when using CollNet - info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum + proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? 
ncclSum : // Network sees avg as sum info->op; proxyOp->pattern = info->pattern; proxyOp->root = info->root; @@ -1502,20 +1520,20 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf int channelId; NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1UL<p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) { + if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].connected == 0) { comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector comm->connectRecv[peer] |= (1UL<p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) { + if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].connected == 0) { comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; ncclIntruQueueEnqueue(&tasks->collQueue, t); - tasks->collBytesTotal += t->count*ncclTypeSize(t->datatype); + tasks->collBytesTotal += info->nBytes; tasks->nTasksColl += 1; } } @@ -1611,10 +1629,10 @@ exit: NCCLCHECK(ncclGroupEndInternal()); /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change * so we have to check state here. 
*/ - if (info->comm && !info->comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) }; + if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) }; return ret; fail: - if (info->comm && !info->comm->blocking) (void) ncclCommSetAsyncError(info->comm, ret); + if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret); goto exit; } diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 617ea78e5f..82a51e77ad 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -15,15 +15,10 @@ /********************* Internode connection ***********************/ /******************************************************************/ -ncclResult_t ncclTopoPreset(struct ncclComm* comm, - struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, - struct ncclTopoRanks* topoRanks) { +ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; + int localRanks = comm->topo->nodes[GPU].count; int nChannels = comm->nChannels; - int localRanks = 0; - for (int i=0; itopo->nodes[GPU].count; i++) { - localRanks += comm->topo->nodes[GPU].nodes[i].gpu.nRanksPerGpu; - } for (int c=0; cchannels+c; @@ -39,9 +34,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, for (int i=0; icollnetDirect.up[i] = -1; for (int i=0; icollnetDirect.down[i] = -1; - int* ringIntra = ringGraph->intra+c*localRanks; - int* treeIntra = treeGraph->intra+c*localRanks; - int* collNetIntra = collNetGraph->intra+c*localRanks; + int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks; + int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks; + int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks; + int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks; for (int i=0; ipattern == NCCL_TOPO_PATTERN_TREE ? 
0 : 1; - int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0; + int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; + int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0; topoRanks->treeToParent[c] = treeIntra[parentIndex]; topoRanks->treeToChild0[c] = treeIntra[child0Index]; @@ -68,6 +64,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, } topoRanks->ringPrev[c] = channel->ring.prev; topoRanks->ringNext[c] = channel->ring.next; + topoRanks->nvlsHeads[c] = nvlsIntra[0]; } // Duplicate channels rings/trees struct ncclChannel* channel0 = comm->channels; @@ -79,10 +76,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph) { int nChannels = comm->nChannels; - int localRanks = 0; - for (int i=0; itopo->nodes[GPU].count; i++) { - localRanks += comm->topo->nodes[GPU].nodes[i].gpu.nRanksPerGpu; - } + int localRanks = comm->topo->nodes[GPU].count; //new tree for (int c=0; cintra+c%3*localRanks; @@ -120,26 +114,26 @@ ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, return ncclSuccess; } -static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) { +static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) { int nChannels = comm->nChannels; int nNodes = comm->nNodes; for (int c=0; cnRanks; - int* send = ringSend+c*comm->nRanks; + int* recv = ringRecv+c*comm->nNodes; + int* send = ringSend+c*comm->nNodes; int* prev = ringPrev+c*comm->nRanks; int* next = ringNext+c*comm->nRanks; struct ncclChannel* channel0 = comm->channels+c; struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 
0 : channel0+nChannels; for (int n=0; nrank == recvRank) { channel0->ring.prev = prevSendRank; if (channel1) channel1->ring.prev = prevSendRank; } - int sendRank = send[firstRanks[n]]; - int nextRecvRank = recv[firstRanks[(n+1)%nNodes]]; + int sendRank = send[n]; + int nextRecvRank = recv[(n+1)%nNodes]; next[sendRank] = nextRecvRank; if (comm->rank == sendRank) { channel0->ring.next = nextRecvRank; @@ -152,8 +146,8 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring return ncclSuccess; } -static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) { - for (int n=0; nnChannels > MAXCHANNELS/2) ? comm->nChannels/2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node; - int* ranksToParent, *ranksToChild0, *ranksToChild1; - NCCLCHECK(ncclCalloc(&ranksToParent, nNodes)); - NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes)); - NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes)); // Compute tree depth. Not an exact value but a good approximation in most // cases int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType; + int* ttp, *ttc0, *ttc1; NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType)); - if (comm->nChannels <= MAXCHANNELS/2) { for (int c=0; cchannels+c; struct ncclChannel* channel1 = channel0+nChannels; - NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks)); - if (comm->rank == ranksToParent[node]) { - NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u)); - NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? 
ranksToChild0 : ranksToChild1, t1u)); + ttp = treeToParent+c*comm->nNodes; + ttc0 = treeToChild0+c*comm->nNodes; + ttc1 = treeToChild1+c*comm->nNodes; + if (comm->rank == ttp[node]) { + NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u)); + NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u)); } - if (comm->rank == ranksToChild0[node]) { - NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0)); - NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0)); + if (comm->rank == ttc0[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0)); + NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0)); } - if (comm->rank == ranksToChild1[node]) { - NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1)); - NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1)); + if (comm->rank == ttc1[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1)); + NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1)); } - if (comm->rank == ranksToParent[node] || - comm->rank == ranksToChild0[node] || - comm->rank == ranksToChild1[node]) { + if (comm->rank == ttp[node] || + comm->rank == ttc0[node] || + comm->rank == ttc1[node]) { INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); } @@ -219,64 +209,63 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* } else { for (int c=0; cchannels+c; - NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks)); - if (comm->rank == ranksToParent[node]) { - 
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u)); + ttp = treeToParent+c*comm->nNodes; + ttc0 = treeToChild0+c*comm->nNodes; + ttc1 = treeToChild1+c*comm->nNodes; + if (comm->rank == ttp[node]) { + NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u)); } - if (comm->rank == ranksToChild0[node]) { - NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0)); + if (comm->rank == ttc0[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0)); } - if (comm->rank == ranksToChild1[node]) { - NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1)); + if (comm->rank == ttc1[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1)); } - if (comm->rank == ranksToParent[node] || - comm->rank == ranksToChild0[node] || - comm->rank == ranksToChild1[node]) { - INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); + if (comm->rank == ttp[node] || + comm->rank == ttc0[node] || + comm->rank == ttc1[node]) { + INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); } channel0->tree.depth = depth; } for (int c=nChannels; cchannels+c; - NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks)); - if (comm->rank == ranksToParent[node]) { - NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u)); + ttp = treeToParent+c*comm->nNodes; + ttc0 = treeToChild0+c*comm->nNodes; + ttc1 = treeToChild1+c*comm->nNodes; + if (comm->rank == ttp[node]) { + NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? 
ttc0 : ttc1, t1u)); } - if (comm->rank == ranksToChild0[node]) { - NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0)); + if (comm->rank == ttc0[node]) { + NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0)); } - if (comm->rank == ranksToChild1[node]) { - NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1)); + if (comm->rank == ttc1[node]) { + NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1)); } - if (comm->rank == ranksToParent[node] || - comm->rank == ranksToChild0[node] || - comm->rank == ranksToChild1[node]) { - INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); + if (comm->rank == ttp[node] || + comm->rank == ttc0[node] || + comm->rank == ttc1[node]) { + INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); } channel1->tree.depth = depth; } } - free(ranksToParent); - free(ranksToChild0); - free(ranksToChild1); return ncclSuccess; } static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) { int rank = comm->rank; int localRanks = comm->localRanks; - int nHeads = collNetGraph->nChannels; + int nHeads = 0; int *heads; - NCCLCHECK(ncclCalloc(&heads, nHeads)); + NCCLCHECK(ncclCalloc(&heads, localRanks)); // Find all head ranks // Head index is always 0 - for (int c=0; cnChannels; c++) { int* collNetIntra = collNetGraph->intra+c*localRanks; - heads[c] = collNetIntra[0]; + int head = collNetIntra[0]; + for (int h=0; hnChannels; c++) { @@ -315,10 +304,96 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* INFO(NCCL_GRAPH, "%s", line); channel->collnetChain.depth = comm->nRanks/comm->nNodes; } + for (int c=0; cnvlsChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; + } free(heads); 
return ncclSuccess; } +static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) { + int nHeads = nvlsGraph->nChannels; + int headRank = -1; + for (int h=0; hintra[h*comm->localRanks] == comm->rank) headRank = h; + } + + if (nHeads == 0) { + comm->nvlsChannels = 0; + return ncclSuccess; + } + + for (int c=0; cnvlsChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + channel->nvls.nHeads = nHeads; + for (int h=0; hnvls.up[h] = comm->nRanks+1+h; + for (int h=nHeads; hnvls.up[h] = -1; + channel->nvls.down = comm->nRanks+1+headRank; + channel->nvls.out = -1; // NVLS+SHARP not yet implemented. + channel->nvls.headRank = headRank; + channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; + channel->nvls.node = comm->node; + channel->nvls.nNodes = comm->nNodes; + } + if (comm->nNodes == 1) return ncclSuccess; + + // Connect Trees + int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1; + int pc0, pc1; // ignored + NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node, + &tree0Parent, &tree0Child0, &tree0Child1, &pc0, + &tree1Parent, &tree1Child0, &tree1Child1, &pc1)); + + int* heads = NULL; + int treeUp[2] = { -1, -1 }; + int treeDown0[2] = { -1, -1 }; + int treeDown1[2] = { -1, -1 }; + + if (comm->node == 0) { + for (int h=0; hnNodes; + for (int n=0; nnNodes && n<20; n++) { + sprintf(line+strlen(line), " %2d", heads[n]); + } + INFO(NCCL_INIT, "%s", line); + } + } + + // Find the heads where I'm the head rank and retain tree up/down + for (int h=0; hnNodes; + if (heads[comm->node] == comm->rank) { + treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent]; + treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0]; + treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1]; + treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent]; + treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0]; + treeDown1[1] = tree1Child1 == -1 ? 
-1 : heads[tree1Child1]; + break; + } + } + // Set prev/next in all channels (NVLS compute channels work + // orthogonally to NVLS search channels). + for (int c=0; cnvlsChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + channel->nvls.treeUp = treeUp[c%2]; + channel->nvls.treeDown[0] = channel->nvls.down; + int ix = 1; + if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2]; + if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2]; + } + + struct ncclNvls* nvls0 = &comm->channels[0].nvls; + struct ncclNvls* nvls1 = &comm->channels[1].nvls; + INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d", + nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp, + nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp); + return ncclSuccess; +} + // Legacy naming NCCL_PARAM(MinNrings, "MIN_NRINGS", -2); NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2); @@ -360,33 +435,40 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev return c; } -ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc) { +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc) { // Gather data from all ranks - int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1; + int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads; int nranks = comm->nRanks; + int nNodes = comm->nNodes; int nChannels = comm->nChannels; - NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringPrev, 
nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS)); - for (int i=0; iringRecv[c]; - ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c]; - ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c]; - ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c]; - treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c]; - treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c]; - treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c]; + NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS)); + for (int c=0; cringRecv[c]; + ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c]; + treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c]; + treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c]; + treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c]; + nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c]; + } + for (int r=0; rringPrev[c]; + ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c]; } } // Connect rings and trees. This should also duplicate the channels. 
- NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks)); - NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns)); + NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext)); + NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns)); + NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS])); // Duplicate ringPrev/ringNext for ncclBuildRing if (nChannels <= MAXCHANNELS/2) memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); @@ -400,6 +482,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Setup CollNet if (comm->collNetSupport == 1) { + struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); @@ -408,10 +491,21 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECK(connectCollNet(comm, collNetGraph)); } + // Use 4 compute channels per search channel to reach peak BW on <8 PPN + if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= MAXCHANNELS) { + nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); + } + // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // We permit combining max, then min, to only use the first channels, then duplicate them. - nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels); - nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(nc, ncclMinNchannels()), ringPrev, ringNext); + if (comm->sharedRes->owner != comm) { + /* child comm #channels cannot exceed top parent #channels. 
*/ + nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels); + nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext); + } else { + nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs); + nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), ringPrev, ringNext); + } // Create rings array and check all is fine NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); @@ -423,6 +517,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa free(treeToParent); free(treeToChild0); free(treeToChild1); + free(nvlsHeads); return ncclSuccess; } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 8fc65f9748..cb2bf81b31 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -273,7 +273,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ struct ncclTopoNode* intermediateNode = path->list[0]->remNode; if (intermediateNode->type == GPU) { intermediateIndex = intermediateNode - system->nodes[GPU].nodes; - if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank[0]; + if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank; } } @@ -409,7 +409,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead int proxyRank, g; - NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank[0], netDev, &proxyRank)); + NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; 
distance = proxyGpu->paths[NET][n].type; @@ -489,7 +489,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev); return ncclInternalError; } - *intermediateRank = node->gpu.rank[0]; + *intermediateRank = node->gpu.rank; } else { *intermediateRank = rank; } @@ -563,6 +563,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system)); } + // Set direct paths to NVSwitches. + for (int n=0; nnodes[NVS].count; n++) { + NCCLCHECK(ncclTopoSetPaths(system->nodes[NVS].nodes+n, system)); + } + // Update path for GPUs when we don't want to / can't use GPU Direct P2P for (int g=0; gnodes[GPU].count; g++) { for (int p=0; pnodes[GPU].count; p++) { @@ -578,10 +583,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm if (comm == NULL) continue; // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM - struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank[0]; + struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; - struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank[0]; + struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank; int p2p; NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo)); if (p2p == 0) { @@ -589,7 +594,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); if (shm == 0) { // Mark this peer as inaccessible. We'll trim it later. 
- system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; + system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET; } } } @@ -603,32 +608,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // Check whether we can access the NIC through another NVLink-connected GPU (PXN) struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; if (ncclPxnDisable(comm) != 1) { - int pxnGpu = -1; - - for (int p=0; pnodes[GPU].count; p++) { - if (p == g) continue; - + int localGpuIndex; + NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex)); + if (localGpuIndex != g && localGpuIndex != -1) { // PXN = PCI + NVLink. - struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p; + struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... - if (peerNode->paths[NET][n].type > PATH_PXB || // Is connected to the NIC through PCI - peerNode->paths[GPU][g].type > PATH_NVL || // Is connected to us through NVLink - (peerNode->paths[NET][n].bw <= gpu->paths[NET][n].bw && // Has either higher BW to that NIC - gpu->paths[NET][n].type <= PATH_PXB)) // or avoids going through a CPU - continue; - - pxnGpu = p; - - int netDev; - NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank[0], &netDev)); - // To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one. - if (netDev == netNode->id) break; - } - if (pxnGpu != -1) { + if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI + peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC + gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU // We can use that GPU as relay to communicate with that NIC. 
// Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) - NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n)); + NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n)); } } // Update path when we dont want to / can't use GPU Direct RDMA. @@ -659,16 +652,11 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* domains[g] = g; ids[g] = gpu->id; for (int p=0; ppaths[GPU][p].count > 0) { + if (gpu->paths[GPU][p].type < PATH_NET) { domains[g] = std::min(domains[g], domains[p]); } } - for (int j=0; jgpu.nRanksPerGpu; j++ ) { - if (gpu->gpu.rank[j] == comm->rank) { - myDomain = domains[g]; - break; - } - } + if (gpu->gpu.rank == comm->rank) myDomain = domains[g]; } int ngpus = system->nodes[GPU].count; @@ -732,7 +720,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL; for (int g = 0; g < system->nodes[GPU].count; g++) { int net; - NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net)); + NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net)); NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); if (!gdr) break; } @@ -742,16 +730,12 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* INFO(NCCL_GRAPH, "GDR is available on all GPUs"); } } - if (rcclParamEnableIntranet()) { remove = 0; system->type |= RCCL_TOPO_FORCE_INTRA; } - comm->localRanks = 0; - for (int n=0; nnodes[GPU].count; n++ ) { - comm->localRanks += system->nodes[GPU].nodes[n].gpu.nRanksPerGpu; - } - if (comm->localRanks == comm->nRanks && remove) { + comm->localRanks = system->nodes[GPU].count; + if (system->nodes[GPU].count == comm->nRanks && remove) { for (int n=system->nodes[NET].count-1; n>=0; n--) NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } @@ -808,8 +792,14 @@ static int 
nextPow2(int v) { ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { /* here we already honor comm->max/minCTAs for p2pnChannels. */ - comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); - comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); + if (comm->sharedRes->owner != comm) { + comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); + comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()), comm->sharedRes->tpP2pNChannels); + } else { + comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); + comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); + } + int minChannels = comm->p2pnChannels; // We need to loop through all local GPUs to have a global picture for (int g=0; gtopo->nodes[GPU].count; g++) { @@ -857,14 +847,10 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr int nvbGpus = 0; for (int g=0; gnodes[GPU].nodes+g; - int j=0; - for ( ; jgpu.nRanksPerGpu; j++ ){ - if (gpu->gpu.rank[j] == rank) break; - } - if ( j == gpu->gpu.nRanksPerGpu ) continue; + if (gpu->gpu.rank != rank) continue; for (int p=0; ppaths[GPU][p].type == PATH_NVB) { - (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank[j]; + (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank; } } } diff --git a/src/graph/rome_models.cc b/src/graph/rome_models.cc index 67e31a793b..f5ce15b168 100644 --- a/src/graph/rome_models.cc +++ b/src/graph/rome_models.cc @@ -691,7 +691,7 @@ ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct n if (g == system->nodes[GPU].nodes[j].gpu.dev) break; if (j < ngpus) - graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank[0]; + graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank; else return ncclInternalError; } @@ -725,7 +725,7 @@ end: if (graph->id == 1) { for (int i=0; 
inChannels; i++) { int net; - ncclTopoGetLocalNet(system, graph->intra[i*ngpus+1], &net); + ncclTopoGetLocalNet(system, graph->intra[i*ngpus+1], i, &net); graph->inter[i*2+1] = net; } } @@ -788,7 +788,7 @@ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, str break; if (j < ngpus) { - graph->treeBase[r][x] = system->nodes[GPU].nodes[j].gpu.rank[0]; + graph->treeBase[r][x] = system->nodes[GPU].nodes[j].gpu.rank; y=r; } else @@ -926,15 +926,15 @@ ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGrap // find the first unsed GPU that is closest to NIC int f, m; for (f = 0; f < ngpus; f++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank[0]) break; + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break; if(j >= n) break; } for (int i = 0; i < ngpus; i++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank[0]) break; + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break; if (j < n) continue; if (paths[i].count < paths[f].count) f = i; } - for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank[0]) break; + for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break; used[n] = graph->intra[n*ngpus+m]; for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)]; for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i]; diff --git a/src/graph/search.cc b/src/graph/search.cc index a8f840ce80..45364fb8ba 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -13,6 +13,8 @@ #include #include "rome_models.h" +NCCL_PARAM(CrossNic, "CROSS_NIC", 2); + // Initialize system->maxBw. This is the per-channel (i.e. per-SM) // max bw. 
static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { @@ -109,15 +111,26 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc if (type1 == -1) return ncclSuccess; struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1; struct ncclTopoLinkList* path = node1->paths[type2]+index2; + struct ncclTopoNode* node2 = system->nodes[type2].nodes+index2; + struct ncclTopoLinkList* revPath = node2->paths[type1]+index1; + + if (path == NULL) { + WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2); + return ncclInternalError; + } if (path->count == 0 ) return ncclSuccess; // Now check link type *node = NULL; - int intra = type1 == GPU && type2 == GPU; + int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS); float bw = intra ? graph->bwIntra : graph->bwInter; int type = intra ? graph->typeIntra : graph->typeInter; if (mult == 1 && (path->type > type)) return ncclSuccess; + if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || + graph->pattern == NCCL_TOPO_PATTERN_TREE || + graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) && + (revPath->type > type)) return ncclSuccess; bw *= mult; @@ -186,11 +199,9 @@ static int cmpIntraScores(struct ncclGpuScore* scores, int count) { static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* index) { for (int g=0; gnodes[GPU].count; g++) { - for (int j=0; jnodes[GPU].nodes[g].gpu.nRanksPerGpu; j++) { - if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) { - *index = g; - return ncclSuccess; - } + if (system->nodes[GPU].nodes[g].gpu.rank == rank) { + *index = g; + return ncclSuccess; } } WARN("Could not find gpu rank %d", rank); @@ -259,7 +270,7 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, 
int* time); // Try to keep all searchs within one second -#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18) +#define NCCL_SEARCH_GLOBAL_TIMEOUT (5ULL<<16) #define NCCL_SEARCH_TIMEOUT (1<<14) #define NCCL_SEARCH_TIMEOUT_TREE (1<<14) #define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8) @@ -272,13 +283,9 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels == 0) return ncclInternalError; int ngpus = system->nodes[GPU].count; int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1]; - for (int i=0; inodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) { - if (system->nodes[GPU].nodes[i].gpu.rank[j] == nextRank) { - *g = i; - return ncclSuccess; - } - } + for (int i=0; inodes[GPU].nodes[i].gpu.rank == nextRank) { + *g = i; + return ncclSuccess; } if (*g == -1) return ncclInternalError; return ncclSuccess; @@ -308,26 +315,18 @@ static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph int n = graph->intra[ngpus*c+((i+1)%ngpus)]; struct ncclTopoNode *node; int j; - for (j=0; jnodes[GPU].nodes[j].gpu.nRanksPerGpu; k++) { - if (system->nodes[GPU].nodes[j].gpu.rank[k] == g) - found = true; - } - if (found) break; - } + for (j=0; jnodes[GPU].nodes[j].gpu.rank == g) break; if (jnodes[GPU].nodes+j; for (int k = 0; knodes[GPU].count; k++) { if (node->paths[GPU][k].count == 1) { struct ncclTopoLink* link = node->paths[GPU][k].list[0]; struct ncclTopoNode* remNode = link->remNode; - for (int l=0; lgpu.nRanksPerGpu; l++) { - if (remNode->gpu.rank[l] == n) { - if (link->type == LINK_NVL) - count ++; - } - } + if (remNode->gpu.rank == n) { + if (link->type == LINK_NVL) + count ++; + } } } } @@ -336,17 +335,57 @@ static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph return count; } +ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { + struct ncclTopoNode* nvs; + struct ncclTopoNode* gpu; + 
int d0=0; // See if there is enough bandwidth for NVS->GPU traffic + do { + NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu)); + d0++; + } while (gpu && d0 < system->nodes[GPU].count); + if (gpu == NULL) { + d0--; + } else { + int d1=0; // See if there is enough bandwidth for GPU->NVS traffic + do { + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs)); + d1++; + } while (nvs && d1 < system->nodes[GPU].count); + if (nvs == NULL) { + d1--; + } else { // Both directions worked. Move on to the next path. + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time)); + } + while (d1) { + d1--; + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs)); + } + } + while (d0) { + d0--; + NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu)); + } + return ncclSuccess; +} + ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) { - // 1. Constraint to get the same nChannels between Rings and Trees + // 1. Try to get the same nChannels between Rings and Trees if (graph->nChannels < graph->minChannels) return ncclSuccess; + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better. + if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1; + return ncclSuccess; + } // 2. Try to get better bandwidth - if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess; - if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) { + // Give a 15% perf bonus to paths not crossing nics + float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15; + if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) { *copy = 1; return ncclSuccess; } - // 3. 
Less hops (but not at the price of going cross NICs) + if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess; + + // 3. Less hops if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1; // 4. Prefer graph with more XGMI connections @@ -426,7 +465,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->nChannels--; return ncclSuccess; } - graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank[0]; + graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; if (step == backToNet) { // first get back to NIC @@ -467,6 +506,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo } free(nets); } + } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time)); } else if (step < system->nodes[GPU].count-1) { // Go to next GPU int next[NCCL_TOPO_MAX_NODES]; @@ -512,7 +553,6 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo struct ncclTopoNode* gpu; if (graph->collNet && net->net.collSupport == 0) continue; if (net->net.bw < bw) continue; - if (net->net.maxChannels == 0) continue; graph->inter[graph->nChannels*2] = net->id; graph->latencyInter = net->net.latency; @@ -523,59 +563,63 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo system->nodes[NET].nodes[i].net.bw -= bw; } } - net->net.maxChannels--; - // First try to replay the last channel - if (graph->nChannels > 0) { - int g; - NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); - } - if (graph->nChannels == 0 || graph->sameChannels == 0) { - if (graph->nChannels == 0) { - // Always try the PCI order first to set a reference, 
but don't count in the timeout nor let it run for long - struct ncclTopoLinkList* paths = net->paths[GPU]; - int f = 0, f_gdr = 0; - // find the first GPU that is closest to NIC - for (int i = 0; inodes[GPU].count; i++) { - if (paths[i].count <= paths[f].count) { - // prefer GPU direct RDMA - int gdr; - NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr)); - if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) { - f = i; - f_gdr = gdr; + // NVLS needs to balance on all NICs + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels])); + } else { + if (graph->nChannels > 0) { + // Try to replay the last channel + int g; + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); + } + if (graph->nChannels == 0 || graph->sameChannels == 0) { + if (graph->nChannels == 0) { + // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long + struct ncclTopoLinkList* paths = net->paths[GPU]; + int f = 0, f_gdr = 0; + // find the first GPU that is closest to NIC + for (int i = 0; inodes[GPU].count; i++) { + if (paths[i].count <= paths[f].count) { + // prefer GPU direct RDMA + int gdr; + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr)); + if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) { + f = i; + f_gdr = gdr; + } } } + int t = 1 << 10; + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); + if (t == -1) *time = -1; } - int t = 1 << 10; - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? 
FORCED_ORDER_PCI : 0, &t, NET, n, f)); - if (t == -1) *time = -1; - } - // Then try the most local GPUs - float maxBw = 0; - int minHops = 0xfffffff; - struct ncclTopoLinkList* paths = net->paths[GPU]; - for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].bw > maxBw) { - maxBw = paths[g].bw; - minHops = paths[g].count; - } else if (paths[g].bw == maxBw && paths[g].count < minHops) { - minHops = paths[g].count; + // Then try the most local GPUs + float maxBw = 0; + int minHops = 0xfffffff; + struct ncclTopoLinkList* paths = net->paths[GPU]; + for (int g=0; gnodes[GPU].count; g++) { + if (paths[g].bw > maxBw) { + maxBw = paths[g].bw; + minHops = paths[g].count; + } else if (paths[g].bw == maxBw && paths[g].count < minHops) { + minHops = paths[g].count; + } } - } - if (maxBw >= bw) { - // In the first loop, avoid using GPUs in both directions between channels (one channel - // sending from that GPU and one channel receiving to that GPU), since that usually leads - // to lower BW. - for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { - for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].bw == maxBw && paths[g].count == minHops) { - gpu = system->nodes[GPU].nodes+g; - int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1; - if (tryGpuBidir == gpuUsed) { - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); + if (maxBw >= bw) { + // In the first loop, avoid using GPUs in both directions between channels (one channel + // sending from that GPU and one channel receiving to that GPU), since that usually leads + // to lower BW. + for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { + for (int g=0; gnodes[GPU].count; g++) { + if (paths[g].bw == maxBw && paths[g].count == minHops) { + gpu = system->nodes[GPU].nodes+g; + int gpuUsed = gpuPciBw(gpu) > 0 ? 
0 : 1; + if (tryGpuBidir == gpuUsed) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); + } } } } @@ -583,7 +627,6 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } - net->net.maxChannels++; for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { @@ -634,7 +677,10 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { // Intra-node only. - if (graph->nChannels == 0) { + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels)); + return ncclSuccess; + } else if (graph->nChannels == 0) { // Try PCI order first NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0)); } else { @@ -683,7 +729,7 @@ ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, st } else if (strcmp(sub->name, "gpu") == 0) { int rank = -1; for (int g=0; gnodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank[0]; + if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank; } if (rank == -1) { WARN("XML Import Channel : dev %d not found.", dev); @@ -701,7 +747,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc int crossNic; NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic)); - if (graph->crossNic == 0 && crossNic == 1) return ncclSuccess; + if (ncclParamCrossNic() == 0 && crossNic == 1) return ncclSuccess; graph->crossNic = crossNic; NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern)); @@ -744,9 +790,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* 
graph, int c, struc NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node)); int dev = -1; for (int i=0; inodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) { - if (system->nodes[GPU].nodes[i].gpu.rank[j] == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev; - } + if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev; } if (dev == -1) { WARN("XML Export Channel : rank %d not found.", intra[g]); @@ -795,50 +839,39 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) float speedArrayIntra[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; float speedArrayInter[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) +#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) #else -float speedArrayIntra[] = { 44.0, 30.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 }; -float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; -#endif +float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 }; +float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) -RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0); -NCCL_PARAM(CrossNic, "CROSS_NIC", 2); +float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 }; +float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) +#define 
NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) +#endif -static void ncclExpandMultiRank(ncclTopoSystem* system, struct ncclTopoGraph* graph) -{ - // Expand the intra array to the multi-ranks per node scenario - int ngpus = system->nodes[GPU].count; - int intraCpy[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; - TRACE(NCCL_GRAPH, "TopoCompute: expanding intra array for multi-rank per GPU scenarios nChannels %d", graph->nChannels); - memcpy(intraCpy, graph->intra, ngpus*sizeof(int)*graph->nChannels); - int tk=0; - for (int n=0; nnChannels; n++ ) { - for (int i=0; inodes[GPU].nodes[j].gpu.rank[0] ) { - for (int k=0; knodes[GPU].nodes[j].gpu.nRanksPerGpu; k++) { - graph->intra[tk++] = system->nodes[GPU].nodes[j].gpu.rank[k]; - } - } - } - } - } -} +RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0); ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; graph->crossNic = ncclParamCrossNic(); - int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; + int crossNic = (system->nodes[NET].count > 1) && graph->crossNic && + (graph->pattern == NCCL_TOPO_PATTERN_RING || + graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || + graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; graph->typeInter = PATH_PIX; graph->nChannels = 0; - graph->sameChannels = 1; graph->nIntraChannels = 0; memset(graph->intraNets, 0, MAXCHANNELS*NCCL_TOPO_MAX_NODES*2*sizeof(int)); + int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 
0 : 1; + graph->sameChannels = trySameChannels; char* str = getenv("NCCL_GRAPH_FILE"); if (str) { @@ -850,10 +883,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels)); INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels); free(xml); - if (graph->nChannels > 0) { - ncclExpandMultiRank(system, graph); - return ncclSuccess; - } + if (graph->nChannels > 0) return ncclSuccess; } str = getenv("NCCL_RINGS"); @@ -866,29 +896,17 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } else if (!rcclParamModelMatchingDisable() && !graph->collNet) { // try to match 8P6L NCCLCHECK(parseChordalRing(system, graph)); - if (graph->nChannels) { - ncclExpandMultiRank(system, graph); - return ncclSuccess; - } + if (graph->nChannels) return ncclSuccess; // try to match Rome 4P2H NCCLCHECK(parseRome4P2H(system, graph)); - if (graph->nChannels) { - ncclExpandMultiRank(system, graph); - return ncclSuccess; - } + if (graph->nChannels) return ncclSuccess; // try to match 1H16P NCCLCHECK(parse1H16P(system, graph)); - if (graph->nChannels) { - ncclExpandMultiRank(system, graph); - return ncclSuccess; - } + if (graph->nChannels) return ncclSuccess; // try to match 4H4P NCCLCHECK(parse4H4P(system, graph)); } - if (graph->nChannels) { - ncclExpandMultiRank(system, graph); - return ncclSuccess; - } + if (graph->nChannels) return ncclSuccess; if ((graph->pattern == NCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) { // limit single node max channels when searching ring graph on Rome @@ -898,6 +916,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int ccMin; NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; + + if (ngpus == 1) if 
(graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; + + if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + // Force intra-node NVLS algorithm to pull evenly from all GPUs. + graph->minChannels = graph->maxChannels = system->nodes[GPU].count; + } struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); @@ -914,7 +940,10 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } int pass = 1; int speedIndex = 0; - while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++; + float maxBw = system->maxBw; + float totalBw = system->totalBw; + if (ngpus == 1 || graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1); + while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; @@ -948,12 +977,19 @@ search: tmpGraph.sameChannels = 0; goto search; } - tmpGraph.sameChannels = 1; + tmpGraph.sameChannels = trySameChannels; if (time != -1) globalTimeout += time; else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; if (globalTimeout < 0 && graph->nChannels) goto done; + // Try a simpler tree + if (ccMin >= 90 && tmpGraph.pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { + tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; + goto search; + } + tmpGraph.pattern = graph->pattern; + int maxTypeIntra = system->nodes[NET].count > 0 ? 
tmpGraph.typeInter : PATH_SYS; if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; @@ -974,20 +1010,13 @@ search: } tmpGraph.crossNic = 0; - // Try a simpler tree - if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) { - tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; - goto search; - } - tmpGraph.pattern = graph->pattern; - // Decrease bw until we find a solution if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) { tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex]; goto search; } speedIndex = 0; - while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++; + while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; } @@ -1016,24 +1045,26 @@ done: memcpy(&tmpGraph, graph, sizeof(tmpGraph)); } - if (graph->nChannels == 0 && graph->collNet == 0) { + if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) { WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); - for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank[0]; + for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; graph->bwIntra = graph->bwInter = 0.1; graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } - if (graph->bwIntra >= 25.0) { - int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); - memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); - memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); - graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); - graph->bwInter /= DIVUP(dupChannels, graph->nChannels); - graph->nChannels = dupChannels; - } - ncclExpandMultiRank(system, graph); + 
if (graph->nChannels == 0) return ncclSuccess; + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess; + if (graph->bwIntra < 25.0) return ncclSuccess; + if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess; + + int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); + memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); + memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); + graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); + graph->bwInter /= DIVUP(dupChannels, graph->nChannels); + graph->nChannels = dupChannels; return ncclSuccess; } @@ -1085,23 +1116,40 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru return ncclSuccess; } +#include "comm.h" +// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head +ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) { + int localRanks = comm->topo->nodes[GPU].count; + for (int c=0; cnChannels; c++) { + if (graph->intra[c*localRanks] == comm->rank) { + *dev = graph->inter[c*2]; + return ncclSuccess; + } + } + WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank); + return ncclInternalError; +} + // 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2); -#include "comm.h" ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) { if (graph) { // Honor the net device in the graph int channel = channelId%graph->nChannels; int ngpus = comm->topo->nodes[GPU].count; int index = graph->intra[channel*ngpus] == rank ? 
0 : 1; - *dev = graph->inter[channel*2+index]; + if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) { + *dev = graph->inter[channel*2+index]; + } else { + NCCLCHECK(getNvlsNetDev(comm, graph, dev)); + } NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); } else if (peerRank == -1) { return ncclInternalError; } else { // Start with our local NIC and local Rank - NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev)); + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, dev)); *proxyRank = rank; int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel(); @@ -1111,7 +1159,9 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG int cudaDev = comm->peerInfo[peerRank].cudaDev; int localRank; if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess; - int netDev = comm->peerInfo[localRank].netDev; + int netDev; + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev)); + int n; // Check that device exists on our node if (ncclParamCrossNic() == 0) { @@ -1131,20 +1181,17 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); } } else if (pxnLevel == 2) { - // Check whether we can access it through our node-local GPU for that NIC. - for (int r=0; rlocalRanks; r++) { - int peerRank = comm->localRankToRank[r]; - if (comm->peerInfo[peerRank].netDev == netDev) { - int g1, g2, n; - NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); - NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2)); - NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); - struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; - if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { - *proxyRank = peerRank; - *dev = netDev; - return ncclSuccess; - } + // Check which local GPU corresponds to that NIC and see if we can use PXN. 
+ int n, g1, g2; + NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); + NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netDev, &g2)); + if (g2 != -1) { + struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; + if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { + *proxyRank = peerGpu->gpu.rank; + *dev = netDev; + return ncclSuccess; } } } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index d41293f7dc..bd4c75310f 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -117,10 +117,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo n->links[0].remNode = n; n->links[0].bw = LOC_BW; n->gpu.dev = NCCL_TOPO_UNDEF; - for (int i=0; igpu.rank[i] = NCCL_TOPO_UNDEF; - } - n->gpu.nRanksPerGpu = NCCL_TOPO_UNDEF; + n->gpu.rank = NCCL_TOPO_UNDEF; n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; } else if (type == CPU) { n->cpu.arch = NCCL_TOPO_UNDEF; @@ -256,15 +253,7 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) { if (node->type == GPU) { - sprintf(line+offset, "%s/%lX (%d", topoNodeTypeStr[node->type], node->id, node->gpu.rank[0]); - int nextOffset; - int nextRank = 1; - while ( nextRank < node->gpu.nRanksPerGpu ) { - nextOffset = strlen(line); - sprintf(line+nextOffset, "/%d", node->gpu.rank[nextRank++]); - } - nextOffset = strlen(line); - sprintf(line+nextOffset, ")"); + sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank); } else if (node->type == CPU) { sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model); } else if (node->type == PCI) { @@ -384,17 +373,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s rcclHipDeviceArch_t arch; 
NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value)); memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t)); - - //NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank)); - const char *rankStr; - NCCLCHECK(xmlGetAttrStr(xmlGpu, "rank", &rankStr)); - char *tmpStr; - char *token = strtok_r ( (char *)rankStr, ",", &tmpStr); - gpu->gpu.nRanksPerGpu = 0; - while (token != NULL && gpu->gpu.nRanksPerGpu < RCCL_TOPO_MAX_RANKS_PER_GPU) { - gpu->gpu.rank[gpu->gpu.nRanksPerGpu++] = atoi(token); - token = strtok_r(NULL, ",", &tmpStr); - } + NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport)); // Do not go any further, nvlinks will be added in a second pass @@ -406,7 +385,6 @@ struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane - ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) { const char* str; @@ -716,8 +694,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); if (node == NULL) continue; NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); - //NCCLCHECK(xmlSetAttrInt(node, "rank", r)); - NCCLCHECK(xmlSetOrAppendAttrInt(node, "rank", r)); + NCCLCHECK(xmlSetAttrInt(node, "rank", r)); NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); } } @@ -744,11 +721,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy } } if (netDevCount == 0) { - NCCLCHECK(ncclNetDevices(comm, &netDevCount)); + NCCLCHECK(comm->ncclNet->devices(&netDevCount)); } for (int 
n=0; nncclNet->getProperties(n, &props)); struct ncclXmlNode* netNode; NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode)); NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); @@ -777,10 +754,8 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy return ncclSuccess; } -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) { - int g; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); - int minType = PATH_SYS; +static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) { + int minType = PATH_DIS; float maxBw = 0; int count = 0; int* nets; @@ -790,20 +765,115 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* i if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) { maxBw = path->bw; minType = path->type; + if (type) *type = minType; count = 0; } if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } - if (count == 0) { - *id = -1; - free(nets); + + *localNetMask = 0ULL; + for (int n=0; n= 64) return ncclInternalError; + *localNetMask |= 1ULL<nodes[GPU].count; + NCCLCHECK(ncclCalloc(&localNetMasks, ngpus)); + + // Fill localNetMasks for all GPUs. + for (int g=0; gnodes[GPU].count; + int* gpus; + NCCLCHECK(ncclCalloc(&gpus, ngpus)); + + // Find localNetMask which includes net with the most local GPUs. 
+ int netLocalGpus = 0, minType = PATH_DIS; + uint64_t localNetMask = 0ULL; + for (int g=0; gnodes[GPU].nodes[g].gpu.dev; - *id = nets[rr%count]; - free(nets); - return ncclSuccess; + // Round robin on GPUs and channels + int gIndex = 0, cId = 0, n = 0; + while (1) { + if (1ULL << n & localNetMask) { + if (n == net) { + *gpuIndex = gpus[gIndex]; + free(gpus); + return ncclSuccess; + } + gIndex++; + if (gIndex == netLocalGpus) { + gIndex = 0; + cId++; + } + } + n = (n+1) % 64; + } } /****************************/ @@ -822,20 +892,18 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) { struct ncclTopoNode* cpu = NULL, *gpu = NULL; for (int g=0; gnodes[GPU].count; g++) { - for (int j=0; jnodes[GPU].nodes[g].gpu.nRanksPerGpu; j++) { - if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) { - gpu = system->nodes[GPU].nodes+g; - // Find closer CPU - int cpuIndex = -1, minHops = 0; - for (int c=0; cnodes[CPU].count; c++) { - int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count; - if (cpuIndex == -1 || nHops < minHops) { - cpuIndex = c; - minHops = nHops; - } - } - cpu = system->nodes[CPU].nodes+cpuIndex; + if (system->nodes[GPU].nodes[g].gpu.rank == rank) { + gpu = system->nodes[GPU].nodes+g; + // Find closer CPU + int cpuIndex = -1, minHops = 0; + for (int c=0; cnodes[CPU].count; c++) { + int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count; + if (cpuIndex == -1 || nHops < minHops) { + cpuIndex = c; + minHops = nHops; + } } + cpu = system->nodes[CPU].nodes+cpuIndex; } } if (cpu == NULL) { @@ -885,6 +953,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu return ncclSuccess; } +ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count) { + *count = system->nodes[GPU].count; + return ncclSuccess; +} + ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) { *count = 
system->nodes[NET].count; return ncclSuccess; @@ -910,11 +983,9 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) { for (int g=0; gnodes[GPU].count; g++) { - for ( int j=0; jnodes[GPU].nodes[g].gpu.nRanksPerGpu; j++ ){ - if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) { - *localRank = g; - return ncclSuccess; - } + if (system->nodes[GPU].nodes[g].gpu.rank == rank) { + *localRank = g; + return ncclSuccess; } } WARN("Could not find local GPU with rank %d", rank); diff --git a/src/graph/topo.h b/src/graph/topo.h index 72a294837c..cc995d4ccf 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -13,12 +13,13 @@ #define LOC_BW 5000.0 #define SM60_NVLINK_BW 18.0 -#define SM70_NVLINK_BW 22.0 -#define SM80_NVLINK_BW 22.0 +#define SM70_NVLINK_BW 20.0 +#define SM80_NVLINK_BW 20.0 +#define SM90_NVLINK_BW 20.0 #define SM86_NVLINK_BW 12.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 -#define SKL_QPI_BW 9.0 +#define SKL_QPI_BW 10.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 @@ -75,7 +76,12 @@ extern const char* topoLinkTypeStr[]; // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) #define PATH_SYS 7 -#define PATH_DIS 7 + +// Connection through the network +#define PATH_NET 8 + +// Disconnected +#define PATH_DIS 9 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -106,7 +112,6 @@ struct ncclTopoLinkList { #define RCCL_TOPO_FORCE_INTRA 16 #define RCCL_TOPO_XGMI_ALL 32 -#define RCCL_TOPO_MAX_RANKS_PER_GPU 8 struct ncclTopoNode { int type; int64_t id; @@ -114,8 +119,7 @@ struct ncclTopoNode { union { struct { int dev; // NVML dev number - int rank[RCCL_TOPO_MAX_RANKS_PER_GPU]; - int nRanksPerGpu; + int rank; int cudaCompCap; int gdrSupport; int gcn; @@ -198,11 +202,9 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i static 
ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) { *index = -1; for (int i=0; inodes[GPU].count; i++) { - for (int j=0; jnodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) { - if (system->nodes[GPU].nodes[i].gpu.rank[j] == rank) { - *index = i; - return ncclSuccess; - } + if (system->nodes[GPU].nodes[i].gpu.rank == rank) { + *index = i; + return ncclSuccess; } } return ncclInternalError; @@ -212,7 +214,7 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in *rank = -1; for (int i=0; inodes[GPU].count; i++) { if (system->nodes[GPU].nodes[i].gpu.dev == dev) { - *rank = system->nodes[GPU].nodes[i].gpu.rank[0]; + *rank = system->nodes[GPU].nodes[i].gpu.rank; return ncclSuccess; } } diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 1a6d04c772..7ab30f3575 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -54,7 +54,10 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 } }; +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { + { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Tree, Ring + { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Collnet Direct, Chain + { 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree // NVLink, PCI, Network #define NCCL_HW_NVLINK 0 @@ -71,18 +74,18 @@ struct tuningModel { static struct tuningModel tuning_model_0 { .hwLat = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect 
(Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* more than 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 
0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .treeCorrectionFactor = { @@ -101,18 +104,18 @@ static struct tuningModel tuning_model_0 { static struct tuningModel tuning_model_1 { .hwLat = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 
0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* more than 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .treeCorrectionFactor = { @@ -131,18 +134,18 @@ static struct tuningModel tuning_model_1 { static struct tuningModel tuning_model_2 { .hwLat = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* 
CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* more than 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .treeCorrectionFactor = { @@ -161,18 +164,18 @@ static struct tuningModel tuning_model_2 { static struct tuningModel tuning_model_3 { .hwLat = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* PCI */ - { /* Tree 
(LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* more than 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .treeCorrectionFactor = { @@ -191,18 
+194,18 @@ static struct tuningModel tuning_model_3 { static struct tuningModel tuning_model_4 { .hwLat = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 } }, + { /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree 
*/ { 0, 0, 0 } }, /* more than 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, }, .treeCorrectionFactor = { @@ -232,21 +235,42 @@ static struct tuningModel rcclTuningModel[] = { #define HOPPER_COMPCAP_IDX 2 // LL128 max BW per channel -static const double ll128MaxBwPerCh[3] = { 20.0, 20.0, 36.7 }; static const double llMaxBws[3][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Hopper-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} }; +static const double perChMaxRingLL128Bws[3][3] = { + /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, + /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, + /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, +}; +static const double perChMaxTreeLL128Bws[3][3] = { + /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, + /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, + /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, +}; static const double perChMaxTreeBws[3][3] = { - /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, + /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, - /* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0}, + /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, }; -ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { - int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 
256 : NCCL_SIMPLE_MAX_NTHREADS; +// Network post overhead in ns (1000 = 1 us) +NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2); + +static float getNetOverhead(struct ncclComm* comm) { + if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001; + int cpuArch, cpuVendor, cpuModel; + NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); + if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; + if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; + else return 1.0; +} + +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { + int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize); @@ -262,7 +286,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = - comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; + comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = + comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = @@ -281,11 +306,12 @@ 
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0; double llMaxBw = llMaxBws[index1][index2]; double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; + double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; + double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring //if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount - struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ }; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; abwIntra : graphs[a]->bwInter; float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw; @@ -315,13 +344,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom else busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p]; #else - if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f); - if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f); if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); } - if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels); + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 
0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw); if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); - if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels); + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); + if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { @@ -331,12 +359,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom busBw /= factor; } #endif - if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75; + if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85; // Convert bus BW to algorithm BW float ratio; if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps; - else if (a == NCCL_ALGO_NVLS) ratio = .75; + else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0; + else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1)); else ratio = .5; comm->bandwidths[coll][a][p] = busBw * ratio; @@ -344,16 +373,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p]; float interLat = graphs[a]->latencyInter ? 
graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p]; //if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8; + if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter; + if (a == NCCL_ALGO_RING) { float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p]; if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { - if (ringGraph->sameChannels) { + if (graphs[a]->sameChannels) { comm->latencies[coll][a][p] += lat; } else { if (p == NCCL_PROTO_SIMPLE) lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling comm->latencies[coll][a][p] += nsteps*lat; } } else { + // Inter-node rings still have to launch nsteps * net overhead. + float netOverhead = 0.0; + if (nNodes > 1) { + netOverhead = getNetOverhead(comm); + if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3; + } + intraLat = std::max(intraLat, netOverhead); comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat; } } else if (a == NCCL_ALGO_TREE) { @@ -363,7 +401,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency } else if (a == NCCL_ALGO_COLLNET_CHAIN) { - comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat; + comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat; + } else if (a == NCCL_ALGO_NVLS) { + if (nNodes > 1) comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p]; + } else if (a == NCCL_ALGO_NVLS_TREE) { + comm->latencies[coll][a][p] += 2*(nNodes-1)*rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p]; } } } @@ -372,7 +414,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Protocols/Algorithms enable/disable, and user overrides. 
// All are enabled except ll128 which is enabled by default only in certain cases. int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; - int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 }; const char *protoStr = getenv("NCCL_PROTO"); if (protoStr) { @@ -385,15 +427,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); } - // Disable NVLink SHARP if not supported - if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0; + if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0; // Disable CollNet if it is not supported if (comm->collNetSupport == 0) { algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; + if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0; // If user has hard set NCCL_ALGO=COLLNET, ignore it - if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) { + if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && + algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) { algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1; if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET"); } @@ -415,7 +458,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption. 
pEnable = 1; pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); - pEnable &= (graphs[a]->typeIntra <= PATH_NVL); + pEnable &= (graphs[a]->typeIntra <= PATH_NVB); pEnable &= (minCompCap == maxCompCap); switch (minCompCap) { case 70: pEnable &= 1; break; @@ -433,28 +476,38 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (comm->rank == 0) { char line[1024]; - sprintf(line, "Latency/AlgBw |"); - for (int a=0; amaxThreads[a][p]); - } - } - INFO(NCCL_TUNING, "%s", line); - for (int c=0; clatencies[c][a][p], comm->bandwidths[c][a][p]); + sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]); } } INFO(NCCL_TUNING, "%s", line); + sprintf(line, " Max NThreads |"); + for (int ba=0; bamaxThreads[a][p]); + } + } + INFO(NCCL_TUNING, "%s", line); + for (int c=0; clatencies[c][a][p], comm->bandwidths[c][a][p]); + } + } + INFO(NCCL_TUNING, "%s", line); + } } } @@ -514,7 +567,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 - && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring + && info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) { + lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring + } #endif // Tree pipelining saves latency in aggregation cases int latCount = algorithm == NCCL_ALGO_RING ? 
numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS); diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 4780de4eec..138d48e401 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -789,8 +789,8 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl } NCCLCHECK(ncclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode)); #else - nvmlDevice_t nvmlDev = NULL; - if (ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL; + nvmlDevice_t nvmlDev; + NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev)); NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode)); #endif return ncclSuccess; diff --git a/src/graph/xml.h b/src/graph/xml.h index 64f0b4cb8f..5ffa6c90c5 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -178,25 +178,6 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName return ncclSuccess; } -static ncclResult_t xmlSetOrAppendAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) { - int index; - NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); - if (index == -1) { - index = node->nAttrs++; - strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); - node->attrs[index].key[MAX_STR_LEN] = '\0'; - snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; - return ncclSuccess; - } - char *tmp = strdup(node->attrs[index].value); - snprintf(node->attrs[index].value, MAX_STR_LEN, "%s,%d", tmp, value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; - free (tmp); - return ncclSuccess; -} - - static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); diff --git a/src/group.cc b/src/group.cc index 477b34ed32..b266654be9 100644 --- a/src/group.cc +++ b/src/group.cc @@ -45,13 +45,14 @@ ncclResult_t ncclAsyncLaunch( job->undo = undo; job->destructor = destructor; job->abortFlag = 
comm->abortFlag; + job->childAbortFlag = comm->childAbortFlag; job->state = ncclGroupJobRunning; job->comm = comm; /* check if there are blocking and nonblocking comms at the same time in group. */ if (ncclGroupBlocking == -1) { /* first met communicator */ - ncclGroupBlocking = comm->blocking; - } else if (ncclGroupBlocking != comm->blocking) { + ncclGroupBlocking = comm->config.blocking; + } else if (ncclGroupBlocking != comm->config.blocking) { WARN("Blocking and nonblocking communicators are not allowed in the same group."); ret = ncclInvalidArgument; } @@ -87,23 +88,20 @@ ncclResult_t ncclGroupStart() { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); - /* if previous group launch does not complete, don't launch this one. */ - if (ncclGroupJobMainPtr != NULL) { - if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) { - ret = ncclInvalidUsage; - goto exit; - } else { - NCCLCHECKGOTO(groupJobComplete(ncclGroupJobMainPtr), ret, exit); - } - } NCCLCHECK(ncclGroupStartInternal()); TRACE_CALL("ncclGroupStart()"); - -exit: return ret; } -ncclResult_t ncclGroupStartInternal() { +inline ncclResult_t ncclGroupStartInternal() { + /* if previous group launch does not complete, don't launch this one. 
*/ + if (ncclGroupJobMainPtr != NULL) { + if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) { + return ncclInvalidUsage; + } else { + NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr)); + } + } ncclGroupDepth++; if (mscclAvailable() && !mscclIsCaller()) { NCCLCHECK(mscclGroupStart()); @@ -204,13 +202,6 @@ failure: return result; } -static inline void groupResetJobState() { - ncclGroupBlocking = -1; - ncclGroupJobMainPtr = NULL; - memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); - return; -} - static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) { struct ncclComm* comm = *groupCommHeadPtr; @@ -255,7 +246,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); } - if (!comm->blocking) + if (!comm->config.blocking) (void) ncclCommSetAsyncError(comm, error); comm = next; } @@ -264,7 +255,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g while (!ncclIntruQueueEmpty(asyncJobsPtr)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); *job->abortFlag = 1; - if (job->comm && !job->comm->blocking) + if (job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, error); if (job->undo) job->undo(job); if (job->destructor) job->destructor((void*)job); @@ -339,6 +330,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { if (*groupAbortFlag == true || errorJobAbortFlag == true) { *job->abortFlag = 1; + if (job->childAbortFlag) *job->childAbortFlag = 1; } job = job->next; @@ -359,7 +351,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); - if (job->comm && !job->comm->blocking) + if (job->comm && 
!job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } @@ -368,7 +360,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; (void) ncclGroupCommLeave(comm); - if (!comm->blocking) { + if (!comm->config.blocking) { (void) ncclCommSetAsyncError(comm, ret); } groupCommHeadMain = next; @@ -449,15 +441,6 @@ fail: goto exit; } -static ncclResult_t groupJobComplete(struct ncclGroupJob* job) { - ncclResult_t ret = ncclSuccess; - if (job) { - ret = ncclAsyncJobComplete(&job->base); - groupResetJobState(); - } - return ret; -} - void ncclGroupJobAbort() { ncclGroupJobAbortFlag = true; (void) groupJobComplete(ncclGroupJobMainPtr); diff --git a/src/include/align.h b/src/include/align.h index e3780fe52c..2a71dd1bc3 100644 --- a/src/include/align.h +++ b/src/include/align.h @@ -13,6 +13,9 @@ #define ROUNDUP(x, y) \ (DIVUP((x), (y))*(y)) +#define ALIGN_POWER(x, y) \ + ((x) > (y) ? 
ROUNDUP(x, y) : ((y)/((y)/(x)))) + #define ALIGN_SIZE(size, align) \ size = ((size + (align) - 1) / (align)) * (align); diff --git a/src/include/alloc.h b/src/include/alloc.h index 262d0cbb9e..4f47be44d9 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -12,6 +12,7 @@ #include "checks.h" #include "align.h" #include "utils.h" +#include "p2p.h" #include #include #include @@ -87,6 +88,77 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be #define MAX_ALLOC_TRACK_NGPU 32 extern struct allocationTracker allocTracker[]; +#if CUDART_VERSION >= 11030 + +#include +#include "cudawrap.h" + +static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { + ncclResult_t result = ncclSuccess; + size_t granularity = 0; + CUdevice currentDev; + CUmemAllocationProp prop = {}; + CUmemAccessDesc accessDesc = {}; + CUmemGenericAllocationHandle handle; + int cudaDev; + int flag = 0; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported + prop.location.id = currentDev; + // Query device to see if RDMA support is available + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); + if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + ALIGN_SIZE(size, granularity); + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, size, &prop, 0)); + /* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + /* Now allow RW access to the newly mapped memory */ + accessDesc.location.type 
= CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = currentDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + if (handlep) *handlep = handle; + TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); + return result; +} + +static inline ncclResult_t ncclCuMemFree(void *ptr) { + if (ptr == NULL) return ncclSuccess; + ncclResult_t result = ncclSuccess; + CUmemGenericAllocationHandle handle; + size_t size = 0; + CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); + CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + return result; +} + +#else + +extern int ncclCuMemEnable(); + +static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} +static inline ncclResult_t ncclCuMemFree(void *ptr) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} + +#endif + template ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) { ncclResult_t result = ncclSuccess; @@ -193,8 +265,13 @@ template ncclResult_t ncclCudaFree(T* ptr) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaFree(ptr), result, finish); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish); + } else { + CUDACHECKGOTO(cudaFree(ptr), result, finish); + } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; diff --git 
a/src/include/bootstrap.h b/src/include/bootstrap.h index 2ecea7a94f..400a479fbe 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -20,6 +20,7 @@ ncclResult_t bootstrapNetInit(); ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); +ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); diff --git a/src/include/channel.h b/src/include/channel.h index 0ebb5a2734..adc38749a5 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -9,7 +9,9 @@ #include "comm.h" ncclResult_t initChannel(struct ncclComm* comm, int channelid); -ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); +ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); +ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; int peerNode = comm->rankToNode[peer]; diff --git a/src/include/checks.h b/src/include/checks.h index 048fc06e9b..c9fd16176e 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -18,11 +18,11 @@ } \ } while(false) -#define CUDACHECKGOTO(cmd, res, label) do { \ +#define CUDACHECKGOTO(cmd, RES, label) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) 
{ \ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - res = ncclUnhandledCudaError; \ + RES = ncclUnhandledCudaError; \ goto label; \ } \ } while(false) @@ -60,11 +60,11 @@ } \ } while(true) -#define SYSCHECKGOTO(statement, res, label) do { \ +#define SYSCHECKGOTO(statement, RES, label) do { \ if ((statement) == -1) { \ /* Print the back trace*/ \ - res = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ } while (0); @@ -72,16 +72,16 @@ #define NEQCHECK(statement, value) do { \ if ((statement) != value) { \ /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ } \ } while (0); -#define NEQCHECKGOTO(statement, value, res, label) do { \ +#define NEQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) != value) { \ /* Print the back trace*/ \ - res = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ } while (0); @@ -89,57 +89,57 @@ #define EQCHECK(statement, value) do { \ if ((statement) == value) { \ /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ } \ } while (0); -#define EQCHECKGOTO(statement, value, res, label) do { \ +#define EQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) == value) { \ /* Print the back trace*/ \ - res = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, 
strerror(errno)); \ goto label; \ } \ } while (0); // Propagate errors up #define NCCLCHECK(call) do { \ - ncclResult_t res = call; \ - if (res != ncclSuccess && res != ncclInProgress) { \ + ncclResult_t RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ /* Print the back trace*/ \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - return res; \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + return RES; \ } \ } while (0); -#define NCCLCHECKGOTO(call, res, label) do { \ - res = call; \ - if (res != ncclSuccess && res != ncclInProgress) { \ +#define NCCLCHECKGOTO(call, RES, label) do { \ + RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ /* Print the back trace*/ \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ } while (0); #define NCCLWAIT(call, cond, abortFlagPtr) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - ncclResult_t res = call; \ - if (res != ncclSuccess && res != ncclInProgress) { \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + ncclResult_t RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return ncclInternalError; \ } \ if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ } while (!(cond)); -#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \ +#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - res = call; \ - if (res != ncclSuccess && res != ncclInProgress) { \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", 
__FILE__, __LINE__, RES); \ goto label; \ } \ - if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ + if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ } while (!(cond)); #define NCCLCHECKTHREAD(a, args) do { \ diff --git a/src/include/collectives.h b/src/include/collectives.h index 0fb2badb66..bda4be7f71 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -63,11 +63,12 @@ struct ncclDevRedOpFull { MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type)) #define DECL3(func, devredop, type, undef) \ - DECL4(func, RING, devredop, type, undef) \ - DECL4(func, TREE, devredop, type, undef) \ + DECL4(func, RING, devredop, type, undef) \ + DECL4(func, TREE, devredop, type, undef) \ DECL4(func, COLLNET_DIRECT, devredop, type, undef) \ - DECL4(func, COLLNET_CHAIN, devredop, type, undef) \ - DECL4(func, NVLS, devredop, type, undef) + DECL4(func, COLLNET_CHAIN, devredop, type, undef) \ + DECL4(func, NVLS, devredop, type, undef) \ + DECL4(func, NVLS_TREE, devredop, type, undef) #if defined(RCCL_BFLOAT16) #define DECL2(func, devredop, undefForFloat) \ diff --git a/src/include/comm.h b/src/include/comm.h index dac5cc8f53..2062de3fca 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -101,19 +101,51 @@ struct ncclCommCallback { ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); }; +struct ncclSharedResources { + int refCount; + struct ncclComm* owner; /* comm which creates this shared res. 
*/ + struct ncclChannelPeer* peers[MAXCHANNELS]; + struct ncclDevChannelPeer* devPeers[MAXCHANNELS]; + /* P2P operation counter, one per channel */ + uint64_t p2pOpCount[MAXCHANNELS]; + /* Collective operation counter */ + uint64_t collOpCount; + int tpNRanks; + int tpNLocalRanks; + int tpNChannels; + int tpP2pNChannels; + int tpP2pChunkSize; + uint64_t magic; + + // top parent rank to localRank translation table + int* tpRankToLocalRank; + // Internal streams + struct ncclStrongStream deviceStream, hostStream; + + /* proxy related shared res */ + struct ncclProxyState* proxyState; +}; + struct ncclChannel { - struct ncclChannelPeer* peers; - struct ncclDevChannelPeer* devPeers; + struct ncclChannelPeer** peers; + struct ncclDevChannelPeer** devPeers; struct ncclRing ring; int* devRingUserRanks; struct ncclTree tree; + struct ncclTree collnetChain; struct ncclDirect collnetDirect; struct ncclTree binTree; struct ncclNvls nvls; + int id; // index of this channel uint32_t workFifoSent; // last used work index+1 - uint64_t p2pOpCount; + + /* comm split sharable resources */ + struct ncclChannelPeer* collnetPeers; + struct ncclDevChannelPeer* collnetDevPeers; + struct ncclChannelPeer* nvlsPeers; + struct ncclDevChannelPeer* nvlsDevPeers; }; struct ncclWorkList { @@ -167,6 +199,10 @@ struct ncclComm { // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; + struct ncclSharedResources* sharedRes; + /* map to top parent ranks. */ + int* topParentRanks; + int* topParentLocalRanks; struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; @@ -180,15 +216,16 @@ struct ncclComm { uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. 
+ uint64_t commHash; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index + //int nvmlDev; // my nvml device index int compCap; // compute capability of the GPU - int minCompCap; // min compute capability in the communicator + int minCompCap, maxCompCap; // min/max compute capability in the communicator int64_t busId; // my PCI bus ID in int format cpu_set_t cpuAffinity; // CPU affinity of the GPU int WarpSize; - int virtualId; int cudaArch; // matches __CUDA_ARCH__ of device int node; @@ -207,12 +244,11 @@ struct ncclComm { // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; - // Collective operation counter - uint64_t collOpCount; // Channels for collectives int nChannels; int nvlsChannels; + int collNetChannels; // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; @@ -237,6 +273,8 @@ struct ncclComm { // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; + volatile uint32_t *childAbortFlag; + uint32_t *abortFlagRefCount; // Flags for enable P2P NET uint32_t p2pNet; @@ -268,21 +306,24 @@ struct ncclComm { char intraPad2[64 - sizeof(uint64_t)]; uint64_t intraBarrierGate; // only used if this is intraComm0 - struct ncclProxyState proxyState; - + struct ncclProxyState* proxyState; + int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet int collNetSupport; + uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; int intraHighestTransportType; + int* collNetHeads; + int collNetHeadsNum; + /* sharable collNet proxy progress resource. */ + struct ncclCollNetSharedRes* collNetSharedRes; // NVLink SHARP (NVLS) support int nvlsSupport; - void* nvlsResources; + /* sharable NVLS resource. 
*/ + struct ncclNvlsSharedRes* nvlsResources; size_t channelSize; // User requested work size (bytes) for channel partitions - // Internal streams - struct ncclStrongStream deviceStream, hostStream; - // pools backed by comm->memPermanent struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; @@ -319,13 +360,7 @@ struct ncclComm { volatile bool collTraceExit; #endif - // communicator mode - int blocking; - // CGA cluster size - int cgaClusterSize; - int minCTAs, maxCTAs; - // network interface name - char *netName; + ncclConfig_t config; // initState is to more conveniently reclaim resources when errors happen. ncclResult_t initState; // flag to indicate if ncclCommFinalize() is called diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index 317ca2df6d..da9ce45a4f 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -11,6 +11,9 @@ #include #include "checks.h" +// Is cuMem API usage enabled +extern int ncclCuMemEnable(); + #if CUDART_VERSION >= 11030 #include #else @@ -85,6 +88,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); #if CUDA_VERSION >= 11070 diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 1c80a3f32c..49a32f148c 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -21,12 +21,13 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2]; -#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet* +#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* #define 
NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 #define NCCL_ALGO_COLLNET_DIRECT 2 #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 @@ -112,10 +113,10 @@ struct ncclConnInfo { }; struct ncclProxyConnector { - int rank; - int localRank; + int tpRank; + int tpLocalRank; + int sameProcess; struct ncclProxyConnection* connection; - struct ncclComm* comm; }; struct ncclConnector { @@ -124,7 +125,6 @@ struct ncclConnector { struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; - struct ncclComm *comm; }; struct ncclRing { @@ -141,6 +141,9 @@ struct ncclRing { }; +// The root of each tree only has one node down (+1 intra-node). +#define NCCL_MAX_TREE_ARITY_TOP 2 +// Nodes inside the binary tree can have up to two nodes down (+1 intra-node). #define NCCL_MAX_TREE_ARITY 3 struct ncclTree { int depth; @@ -161,18 +164,24 @@ struct ncclDirect { #define NCCL_CONN_IDX_P2P_NET 2 #define NCCL_MAX_NVLS_ARITY 8 +#define NCCL_MAX_NVLS_TREE_ARITY 3 struct ncclNvls { int out; int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down int headRank; // Index in 0..nHeads-1 I am the head rank of.
-1 if I'm not a head rank (no local NIC) int up[NCCL_MAX_NVLS_ARITY]; int down; + int treeUp; + int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; + int node; + int nNodes; }; #define NCCL_MAX_CONNS 3 struct ncclChannelPeer { struct ncclConnector send[NCCL_MAX_CONNS]; struct ncclConnector recv[NCCL_MAX_CONNS]; + int refCount; }; struct ncclDevComm; @@ -362,7 +371,7 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must #endif struct alignas(16) ncclDevChannel { - struct ncclDevChannelPeer *peers; + struct ncclDevChannelPeer** peers; struct ncclRing ring; struct ncclTree tree; struct ncclTree collnetChain; diff --git a/src/include/gdrwrap.h b/src/include/gdrwrap.h index c83a2292ae..f532a705e1 100644 --- a/src/include/gdrwrap.h +++ b/src/include/gdrwrap.h @@ -298,7 +298,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); - CUDACHECK(cudaFree(md->gdrDevMem)); + NCCLCHECK(ncclCudaFree(md->gdrDevMem)); free(md); return ncclSuccess; diff --git a/src/include/graph.h b/src/include/graph.h index 38b17d5113..69726e08de 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -59,9 +59,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_TYPE_ROME 4 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); -ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id); +ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); +ncclResult_t 
ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id); +ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex); #define NCCL_TOPO_MAX_NODES 256 @@ -72,6 +74,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); #define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU #define NCCL_TOPO_PATTERN_RING 4 // Ring +#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree struct ncclTopoGraph { // Input / output int id; // ring : 0, tree : 1, collnet : 2 @@ -108,18 +111,16 @@ struct ncclTopoRanks { int treeToParent[MAXCHANNELS]; int treeToChild0[MAXCHANNELS]; int treeToChild1[MAXCHANNELS]; + int nvlsHeads[MAXCHANNELS]; }; -ncclResult_t ncclTopoPreset(struct ncclComm* comm, - struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, - struct ncclTopoRanks* topoRanks); +ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, - struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc); - + struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc); ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph); -ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); #include "info.h" ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time); diff --git 
a/src/include/group.h b/src/include/group.h index dd0e642d22..a92c348863 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -36,6 +36,7 @@ struct ncclAsyncJob { void(*destructor)(void*); ncclGroupJobState_t state; volatile uint32_t *abortFlag; /* point to comm abortFlag */ + volatile uint32_t *childAbortFlag; /* point to child abortFlag */ ncclComm_t comm; }; @@ -67,6 +68,24 @@ extern __thread ncclResult_t ncclGroupError; extern __thread struct ncclComm* ncclGroupCommHead; extern __thread struct ncclComm* ncclGroupCommPreconnectHead; extern __thread int ncclGroupBlocking; +extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; +extern __thread struct ncclGroupJob ncclGroupJobMain; + +static inline void groupResetJobState() { + ncclGroupBlocking = -1; + ncclGroupJobMainPtr = NULL; + memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); + return; +} + +static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { + ncclResult_t ret = ncclSuccess; + if (job) { + ret = ncclAsyncJobComplete(&job->base); + groupResetJobState(); + } + return ret; +} inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { if (ncclGroupDepth > 0) { @@ -91,7 +110,7 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { ncclMemoryStackPush(&comm->memScoped); } - ncclGroupBlocking = comm->blocking; + ncclGroupBlocking = comm->config.blocking; } // Add comm to this thread's group needing preconnect diff --git a/src/include/ibvcore.h b/src/include/ibvcore.h new file mode 100644 index 0000000000..00a6b6f60b --- /dev/null +++ b/src/include/ibvcore.h @@ -0,0 +1,1043 @@ +#ifndef NCCL_IBV_CORE_H_ +#define NCCL_IBV_CORE_H_ + +/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without + * explicit including of IB verbs header. 
+ */ + +#include +#include +#include +#include + +#if __GNUC__ >= 3 +# define __attribute_const __attribute__((const)) +#else +# define __attribute_const +#endif + +union ibv_gid { + uint8_t raw[16]; + struct { + uint64_t subnet_prefix; + uint64_t interface_id; + } global; +}; + +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) \ + ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) +#endif + +#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) + +/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ +//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; + +enum ibv_node_type { + IBV_NODE_UNKNOWN = -1, + IBV_NODE_CA = 1, + IBV_NODE_SWITCH, + IBV_NODE_ROUTER, + IBV_NODE_RNIC, + + /* Leave a gap for future node types before starting with + * experimental node types. + */ + IBV_EXP_NODE_TYPE_START = 32, + IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START +}; + +enum ibv_transport_type { + IBV_TRANSPORT_UNKNOWN = -1, + IBV_TRANSPORT_IB = 0, + IBV_TRANSPORT_IWARP, + + /* Leave a gap for future transport types before starting with + * experimental transport types. 
+ */ + IBV_EXP_TRANSPORT_TYPE_START = 32, + IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START +}; + +enum ibv_device_cap_flags { + IBV_DEVICE_RESIZE_MAX_WR = 1, + IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, + IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, + IBV_DEVICE_RAW_MULTI = 1 << 3, + IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, + IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, + IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, + IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, + IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, + IBV_DEVICE_INIT_TYPE = 1 << 9, + IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, + IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, + IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, + IBV_DEVICE_SRQ_RESIZE = 1 << 13, + IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, + IBV_DEVICE_XRC = 1 << 20, + IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 +}; + +enum ibv_atomic_cap { + IBV_ATOMIC_NONE, + IBV_ATOMIC_HCA, + IBV_ATOMIC_GLOB +}; + +struct ibv_device_attr { + char fw_ver[64]; + uint64_t node_guid; + uint64_t sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; +}; + +enum ibv_mtu { + IBV_MTU_256 = 1, + IBV_MTU_512 = 2, + IBV_MTU_1024 = 3, + IBV_MTU_2048 = 4, + IBV_MTU_4096 = 5 +}; + +enum ibv_port_state { + IBV_PORT_NOP = 0, + IBV_PORT_DOWN = 1, + IBV_PORT_INIT = 2, + IBV_PORT_ARMED = 3, + IBV_PORT_ACTIVE = 4, + IBV_PORT_ACTIVE_DEFER = 
5 +}; + +enum { + IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, + IBV_LINK_LAYER_ETHERNET, + + /* Leave a gap for future link layer types before starting with + * experimental link layer. + */ + IBV_EXP_LINK_LAYER_START = 32, + IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START +}; + +enum ibv_port_cap_flags { + IBV_PORT_SM = 1 << 1, + IBV_PORT_NOTICE_SUP = 1 << 2, + IBV_PORT_TRAP_SUP = 1 << 3, + IBV_PORT_OPT_IPD_SUP = 1 << 4, + IBV_PORT_AUTO_MIGR_SUP = 1 << 5, + IBV_PORT_SL_MAP_SUP = 1 << 6, + IBV_PORT_MKEY_NVRAM = 1 << 7, + IBV_PORT_PKEY_NVRAM = 1 << 8, + IBV_PORT_LED_INFO_SUP = 1 << 9, + IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + IBV_PORT_CM_SUP = 1 << 16, + IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IBV_PORT_REINIT_SUP = 1 << 18, + IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, + IBV_PORT_VENDOR_CLASS = 1 << 24, + IBV_PORT_CLIENT_REG_SUP = 1 << 25, + IBV_PORT_IP_BASED_GIDS = 1 << 26, +}; + +struct ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t reserved; +}; + +enum ibv_event_type { + IBV_EVENT_CQ_ERR, + IBV_EVENT_QP_FATAL, + IBV_EVENT_QP_REQ_ERR, + IBV_EVENT_QP_ACCESS_ERR, + IBV_EVENT_COMM_EST, + IBV_EVENT_SQ_DRAINED, + IBV_EVENT_PATH_MIG, + IBV_EVENT_PATH_MIG_ERR, + IBV_EVENT_DEVICE_FATAL, + IBV_EVENT_PORT_ACTIVE, + IBV_EVENT_PORT_ERR, + IBV_EVENT_LID_CHANGE, + IBV_EVENT_PKEY_CHANGE, + IBV_EVENT_SM_CHANGE, + IBV_EVENT_SRQ_ERR, + IBV_EVENT_SRQ_LIMIT_REACHED, + IBV_EVENT_QP_LAST_WQE_REACHED, + IBV_EVENT_CLIENT_REREGISTER, + IBV_EVENT_GID_CHANGE, + + 
/* new experimental events start here leaving enough + * room for 14 events which should be enough + */ + IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, + IBV_EXP_EVENT_DCT_ACCESS_ERR, + IBV_EXP_EVENT_DCT_REQ_ERR, +}; + +struct ibv_async_event { + union { + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_srq *srq; + struct ibv_exp_dct *dct; + int port_num; + /* For source compatible with Legacy API */ + uint32_t xrc_qp_num; + } element; + enum ibv_event_type event_type; +}; + +enum ibv_wc_status { + IBV_WC_SUCCESS, + IBV_WC_LOC_LEN_ERR, + IBV_WC_LOC_QP_OP_ERR, + IBV_WC_LOC_EEC_OP_ERR, + IBV_WC_LOC_PROT_ERR, + IBV_WC_WR_FLUSH_ERR, + IBV_WC_MW_BIND_ERR, + IBV_WC_BAD_RESP_ERR, + IBV_WC_LOC_ACCESS_ERR, + IBV_WC_REM_INV_REQ_ERR, + IBV_WC_REM_ACCESS_ERR, + IBV_WC_REM_OP_ERR, + IBV_WC_RETRY_EXC_ERR, + IBV_WC_RNR_RETRY_EXC_ERR, + IBV_WC_LOC_RDD_VIOL_ERR, + IBV_WC_REM_INV_RD_REQ_ERR, + IBV_WC_REM_ABORT_ERR, + IBV_WC_INV_EECN_ERR, + IBV_WC_INV_EEC_STATE_ERR, + IBV_WC_FATAL_ERR, + IBV_WC_RESP_TIMEOUT_ERR, + IBV_WC_GENERAL_ERR +}; +const char *ibv_wc_status_str(enum ibv_wc_status status); + +enum ibv_wc_opcode { + IBV_WC_SEND, + IBV_WC_RDMA_WRITE, + IBV_WC_RDMA_READ, + IBV_WC_COMP_SWAP, + IBV_WC_FETCH_ADD, + IBV_WC_BIND_MW, +/* + * Set value of IBV_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IBV_WC_RECV). 
+ */ + IBV_WC_RECV = 1 << 7, + IBV_WC_RECV_RDMA_WITH_IMM +}; + +enum ibv_wc_flags { + IBV_WC_GRH = 1 << 0, + IBV_WC_WITH_IMM = 1 << 1 +}; + +struct ibv_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + int wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; +}; + +enum ibv_access_flags { + IBV_ACCESS_LOCAL_WRITE = 1, + IBV_ACCESS_REMOTE_WRITE = (1<<1), + IBV_ACCESS_REMOTE_READ = (1<<2), + IBV_ACCESS_REMOTE_ATOMIC = (1<<3), + IBV_ACCESS_MW_BIND = (1<<4), + IBV_ACCESS_RELAXED_ORDERING = (1<<20), +}; + +struct ibv_pd { + struct ibv_context *context; + uint32_t handle; +}; + +enum ibv_xrcd_init_attr_mask { + IBV_XRCD_INIT_ATTR_FD = 1 << 0, + IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, + IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_xrcd_init_attr { + uint32_t comp_mask; + int fd; + int oflags; +}; + +struct ibv_xrcd { + struct ibv_context *context; +}; + +enum ibv_rereg_mr_flags { + IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), + IBV_REREG_MR_CHANGE_PD = (1 << 1), + IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), + IBV_REREG_MR_KEEP_VALID = (1 << 3) +}; + +struct ibv_mr { + struct ibv_context *context; + struct ibv_pd *pd; + void *addr; + size_t length; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; +}; + +enum ibv_mw_type { + IBV_MW_TYPE_1 = 1, + IBV_MW_TYPE_2 = 2 +}; + +struct ibv_mw { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t rkey; +}; + +struct ibv_global_route { + union ibv_gid dgid; + uint32_t flow_label; + uint8_t sgid_index; + uint8_t hop_limit; + uint8_t traffic_class; +}; + +struct ibv_grh { + uint32_t version_tclass_flow; + uint16_t paylen; + uint8_t next_hdr; + uint8_t hop_limit; + union ibv_gid sgid; + union ibv_gid dgid; +}; + +enum ibv_rate { + IBV_RATE_MAX = 0, + IBV_RATE_2_5_GBPS = 2, + IBV_RATE_5_GBPS = 5, + IBV_RATE_10_GBPS = 3, + 
IBV_RATE_20_GBPS = 6, + IBV_RATE_30_GBPS = 4, + IBV_RATE_40_GBPS = 7, + IBV_RATE_60_GBPS = 8, + IBV_RATE_80_GBPS = 9, + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18 +}; + +/** + * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; + +/** + * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. + * @mult: multiple to convert. + */ +enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; + +/** + * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. + * For example, IBV_RATE_5_GBPS will return the value 5000. + * @rate: rate to convert. + */ +int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; + +/** + * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. + * @mbps: value to convert. 
+ */ +enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; + +struct ibv_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; +}; + +enum ibv_srq_attr_mask { + IBV_SRQ_MAX_WR = 1 << 0, + IBV_SRQ_LIMIT = 1 << 1 +}; + +struct ibv_srq_attr { + uint32_t max_wr; + uint32_t max_sge; + uint32_t srq_limit; +}; + +struct ibv_srq_init_attr { + void *srq_context; + struct ibv_srq_attr attr; +}; + +enum ibv_srq_type { + IBV_SRQT_BASIC, + IBV_SRQT_XRC +}; + +enum ibv_srq_init_attr_mask { + IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, + IBV_SRQ_INIT_ATTR_PD = 1 << 1, + IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, + IBV_SRQ_INIT_ATTR_CQ = 1 << 3, + IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_srq_init_attr_ex { + void *srq_context; + struct ibv_srq_attr attr; + + uint32_t comp_mask; + enum ibv_srq_type srq_type; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + struct ibv_cq *cq; +}; + +enum ibv_qp_type { + IBV_QPT_RC = 2, + IBV_QPT_UC, + IBV_QPT_UD, + /* XRC compatible code */ + IBV_QPT_XRC, + IBV_QPT_RAW_PACKET = 8, + IBV_QPT_RAW_ETH = 8, + IBV_QPT_XRC_SEND = 9, + IBV_QPT_XRC_RECV, + + /* Leave a gap for future qp types before starting with + * experimental qp types. 
+ */ + IBV_EXP_QP_TYPE_START = 32, + IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START +}; + +struct ibv_qp_cap { + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t max_recv_sge; + uint32_t max_inline_data; +}; + +struct ibv_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + /* Below is needed for backward compatibility */ + struct ibv_xrc_domain *xrc_domain; +}; + +enum ibv_qp_init_attr_mask { + IBV_QP_INIT_ATTR_PD = 1 << 0, + IBV_QP_INIT_ATTR_XRCD = 1 << 1, + IBV_QP_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_qp_init_attr_ex { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + + uint32_t comp_mask; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; +}; + +enum ibv_qp_open_attr_mask { + IBV_QP_OPEN_ATTR_NUM = 1 << 0, + IBV_QP_OPEN_ATTR_XRCD = 1 << 1, + IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, + IBV_QP_OPEN_ATTR_TYPE = 1 << 3, + IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_qp_open_attr { + uint32_t comp_mask; + uint32_t qp_num; + struct ibv_xrcd *xrcd; + void *qp_context; + enum ibv_qp_type qp_type; +}; + +enum ibv_qp_attr_mask { + IBV_QP_STATE = 1 << 0, + IBV_QP_CUR_STATE = 1 << 1, + IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + IBV_QP_ACCESS_FLAGS = 1 << 3, + IBV_QP_PKEY_INDEX = 1 << 4, + IBV_QP_PORT = 1 << 5, + IBV_QP_QKEY = 1 << 6, + IBV_QP_AV = 1 << 7, + IBV_QP_PATH_MTU = 1 << 8, + IBV_QP_TIMEOUT = 1 << 9, + IBV_QP_RETRY_CNT = 1 << 10, + IBV_QP_RNR_RETRY = 1 << 11, + IBV_QP_RQ_PSN = 1 << 12, + IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, + IBV_QP_ALT_PATH = 1 << 14, + IBV_QP_MIN_RNR_TIMER = 1 << 15, + IBV_QP_SQ_PSN = 1 << 16, + IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + IBV_QP_PATH_MIG_STATE = 1 << 18, + IBV_QP_CAP = 1 << 19, + IBV_QP_DEST_QPN = 1 << 20 +}; + +enum ibv_qp_state { + IBV_QPS_RESET, + IBV_QPS_INIT,
+ IBV_QPS_RTR, + IBV_QPS_RTS, + IBV_QPS_SQD, + IBV_QPS_SQE, + IBV_QPS_ERR, + IBV_QPS_UNKNOWN +}; + +enum ibv_mig_state { + IBV_MIG_MIGRATED, + IBV_MIG_REARM, + IBV_MIG_ARMED +}; + +struct ibv_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + int qp_access_flags; + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; +}; + +enum ibv_wr_opcode { + IBV_WR_RDMA_WRITE, + IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_WR_SEND, + IBV_WR_SEND_WITH_IMM, + IBV_WR_RDMA_READ, + IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_WR_ATOMIC_FETCH_AND_ADD +}; + +enum ibv_send_flags { + IBV_SEND_FENCE = 1 << 0, + IBV_SEND_SIGNALED = 1 << 1, + IBV_SEND_SOLICITED = 1 << 2, + IBV_SEND_INLINE = 1 << 3 +}; + +struct ibv_sge { + uint64_t addr; + uint32_t length; + uint32_t lkey; +}; + +struct ibv_send_wr { + uint64_t wr_id; + struct ibv_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + int send_flags; + uint32_t imm_data; /* in network byte order */ + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; + union { + union { + struct { + uint32_t remote_srqn; + } xrc; + } qp_type; + + uint32_t xrc_remote_srq_num; + }; +}; + +struct ibv_recv_wr { + uint64_t wr_id; + struct ibv_recv_wr *next; + struct ibv_sge *sg_list; + int num_sge; +}; + +struct ibv_mw_bind { + 
uint64_t wr_id; + struct ibv_mr *mr; + void *addr; + size_t length; + int send_flags; + int mw_access_flags; +}; + +struct ibv_srq { + struct ibv_context *context; + void *srq_context; + struct ibv_pd *pd; + uint32_t handle; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; + + /* below are for source compatibility with legacy XRC, + * padding based on ibv_srq_legacy. + */ + uint32_t xrc_srq_num_bin_compat_padding; + struct ibv_xrc_domain *xrc_domain_bin_compat_padding; + struct ibv_cq *xrc_cq_bin_compat_padding; + void *ibv_srq_padding; + + /* legacy fields */ + uint32_t xrc_srq_num; + struct ibv_xrc_domain *xrc_domain; + struct ibv_cq *xrc_cq; +}; + +/* Not in use in new API, needed for compilation as part of source compat layer */ +enum ibv_event_flags { + IBV_XRC_QP_EVENT_FLAG = 0x80000000, +}; + + + +struct ibv_qp { + struct ibv_context *context; + void *qp_context; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_comp_channel { + struct ibv_context *context; + int fd; + int refcnt; +}; + +struct ibv_cq { + struct ibv_context *context; + struct ibv_comp_channel *channel; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; +}; + +struct ibv_ah { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t handle; +}; + +enum ibv_flow_flags { + IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, + IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, +}; + +enum ibv_flow_attr_type { + /* steering according to rule specifications */ + IBV_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, + /*
default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_MC_DEFAULT = 0x2, +}; + +enum ibv_flow_spec_type { + IBV_FLOW_SPEC_ETH = 0x20, + IBV_FLOW_SPEC_IPV4 = 0x30, + IBV_FLOW_SPEC_TCP = 0x40, + IBV_FLOW_SPEC_UDP = 0x41, +}; + +struct ibv_flow_eth_filter { + uint8_t dst_mac[6]; + uint8_t src_mac[6]; + uint16_t ether_type; + /* + * same layout as 802.1q: prio 3, cfi 1, vlan id 12 + */ + uint16_t vlan_tag; +}; + +struct ibv_flow_spec_eth { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_eth_filter val; + struct ibv_flow_eth_filter mask; +}; + +struct ibv_flow_ipv4_filter { + uint32_t src_ip; + uint32_t dst_ip; +}; + +struct ibv_flow_spec_ipv4 { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_ipv4_filter val; + struct ibv_flow_ipv4_filter mask; +}; + +struct ibv_flow_tcp_udp_filter { + uint16_t dst_port; + uint16_t src_port; +}; + +struct ibv_flow_spec_tcp_udp { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_tcp_udp_filter val; + struct ibv_flow_tcp_udp_filter mask; +}; + +struct ibv_flow_spec { + union { + struct { + enum ibv_flow_spec_type type; + uint16_t size; + } hdr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_ipv4 ipv4; + struct ibv_flow_spec_tcp_udp tcp_udp; + }; +}; + +struct ibv_flow_attr { + uint32_t comp_mask; + enum ibv_flow_attr_type type; + uint16_t size; + uint16_t priority; + uint8_t num_of_specs; + uint8_t port; + uint32_t flags; + /* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx [L2] + * struct ibv_flow_spec_yyy [L3/L4] + */ +}; + +struct ibv_flow { + uint32_t comp_mask; + struct ibv_context *context; + uint32_t handle; +}; + +struct ibv_device; +struct ibv_context; + +struct ibv_device_ops { + struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); + void (*free_context)(struct ibv_context *context); +}; + +enum { + IBV_SYSFS_NAME_MAX = 64, + IBV_SYSFS_PATH_MAX 
= 256 +}; + +struct ibv_device { + struct ibv_device_ops ops; + enum ibv_node_type node_type; + enum ibv_transport_type transport_type; + /* Name of underlying kernel IB device, eg "mthca0" */ + char name[IBV_SYSFS_NAME_MAX]; + /* Name of uverbs device, eg "uverbs0" */ + char dev_name[IBV_SYSFS_NAME_MAX]; + /* Path to infiniband_verbs class device in sysfs */ + char dev_path[IBV_SYSFS_PATH_MAX]; + /* Path to infiniband class device in sysfs */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; +}; + +struct verbs_device { + struct ibv_device device; /* Must be first */ + size_t sz; + size_t size_of_context; + int (*init_context)(struct verbs_device *device, + struct ibv_context *ctx, int cmd_fd); + void (*uninit_context)(struct verbs_device *device, + struct ibv_context *ctx); + /* future fields added here */ +}; + +struct ibv_context_ops { + int (*query_device)(struct ibv_context *context, + struct ibv_device_attr *device_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + struct ibv_pd * (*alloc_pd)(struct ibv_context *context); + int (*dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, + int access); + struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, + int access); + int (*dereg_mr)(struct ibv_mr *mr); + struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); + int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + int (*dealloc_mw)(struct ibv_mw *mw); + struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); + int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); + void (*cq_event)(struct ibv_cq *cq); + int (*resize_cq)(struct ibv_cq *cq, int cqe); + int (*destroy_cq)(struct ibv_cq *cq); + struct ibv_srq * 
(*create_srq)(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + int (*modify_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + int (*query_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr); + int (*destroy_srq)(struct ibv_srq *srq); + int (*post_srq_recv)(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr); + struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); + int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + int (*destroy_qp)(struct ibv_qp *qp); + int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); + int (*destroy_ah)(struct ibv_ah *ah); + int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + void (*async_event)(struct ibv_async_event *event); +}; + +struct ibv_context { + struct ibv_device *device; + struct ibv_context_ops ops; + int cmd_fd; + int async_fd; + int num_comp_vectors; + pthread_mutex_t mutex; + void *abi_compat; +}; + +enum verbs_context_mask { + VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, + VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, + VERBS_CONTEXT_QP = (uint64_t)1 << 2, + VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, + VERBS_CONTEXT_EXP = (uint64_t)1 << 62 +}; + +struct verbs_context { + /* "grows up" - new fields go here */ + int (*_reserved_2) (void); + int (*destroy_flow) (struct ibv_flow *flow); + int (*_reserved_1) (void); + struct ibv_flow * (*create_flow) (struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + struct ibv_qp * (*open_qp)(struct ibv_context *context, 
+ struct ibv_qp_open_attr *attr); + struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); + int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); + struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + uint64_t has_comp_mask; + size_t sz; /* Must be immediately before struct ibv_context */ + struct ibv_context context;/* Must be last field in the struct */ +}; + +/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ +/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) +{ + return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? + NULL : container_of(ctx, struct verbs_context, context); +} + +#define verbs_get_ctx_op(ctx, op) ({ \ + struct verbs_context *_vctx = verbs_get_ctx(ctx); \ + (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ + !_vctx->op) ? NULL : _vctx; })*/ + +#define verbs_set_ctx_op(_vctx, op, ptr) ({ \ + struct verbs_context *vctx = _vctx; \ + if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ + vctx->op = ptr; }) + +static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) +{ + return (dev->ops.alloc_context) ? 
+ NULL : container_of(dev, struct verbs_device, device); +} + +static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + return qp->context->ops.post_send(qp, wr, bad_wr); +} + +#endif // NCCL_IBV_CORE_H_ diff --git a/src/include/ibvsymbols.h b/src/include/ibvsymbols.h new file mode 100644 index 0000000000..7cf1e08d8c --- /dev/null +++ b/src/include/ibvsymbols.h @@ -0,0 +1,44 @@ +#ifndef NCCL_IBV_SYMBOLS_H_ +#define NCCL_IBV_SYMBOLS_H_ + +#ifdef NCCL_BUILD_RDMA_CORE +#include <infiniband/verbs.h> +#else +#include "ibvcore.h" +#endif + +#include "nccl.h" + +/* IB Verbs Function Pointers*/ +struct ncclIbvSymbols { + int (*ibv_internal_fork_init)(void); + struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); + void (*ibv_internal_free_device_list)(struct ibv_device **list); + const char * (*ibv_internal_get_device_name)(struct ibv_device *device); + struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); + int (*ibv_internal_close_device)(struct ibv_context *context); + int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); + void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); + int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); + int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); + int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); + int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); + struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); + int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); + struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t
iova, unsigned int access); + /* DMA-BUF support */ + struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); + int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); + struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); + int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); + struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); + int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); + int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); + const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); +}; + +/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ +ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); + +#endif // NCCL_IBV_SYMBOLS_H_ diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index c7475890a7..d1c7d08e71 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -12,1044 +12,23 @@ #ifndef NCCL_IBVWRAP_H_ #define NCCL_IBVWRAP_H_ +#ifdef NCCL_BUILD_RDMA_CORE +#include <infiniband/verbs.h> +#else +#include "ibvcore.h" +#endif + #include "core.h" #include <sys/types.h> #include <unistd.h> -// Dynamically handle dependencies on IB verbs - -#if __GNUC__ >= 3 -# define __attribute_const __attribute__((const)) -#else -# define __attribute_const -#endif - -union ibv_gid { - uint8_t raw[16]; - struct { - uint64_t subnet_prefix; - uint64_t interface_id; - } global; -}; - -#ifndef container_of -/** - * container_of - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct.
- * - */ -#define container_of(ptr, type, member) \ - ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) -#endif - -#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) - -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; - -enum ibv_node_type { - IBV_NODE_UNKNOWN = -1, - IBV_NODE_CA = 1, - IBV_NODE_SWITCH, - IBV_NODE_ROUTER, - IBV_NODE_RNIC, - - /* Leave a gap for future node types before starting with - * experimental node types. - */ - IBV_EXP_NODE_TYPE_START = 32, - IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START -}; - -enum ibv_transport_type { - IBV_TRANSPORT_UNKNOWN = -1, - IBV_TRANSPORT_IB = 0, - IBV_TRANSPORT_IWARP, - - /* Leave a gap for future transport types before starting with - * experimental transport types. - */ - IBV_EXP_TRANSPORT_TYPE_START = 32, - IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START -}; - -enum ibv_device_cap_flags { - IBV_DEVICE_RESIZE_MAX_WR = 1, - IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, - IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, - IBV_DEVICE_RAW_MULTI = 1 << 3, - IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, - IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, - IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, - IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, - IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, - IBV_DEVICE_INIT_TYPE = 1 << 9, - IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, - IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, - IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, - IBV_DEVICE_SRQ_RESIZE = 1 << 13, - IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, - IBV_DEVICE_XRC = 1 << 20, - IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 -}; - -enum ibv_atomic_cap { - IBV_ATOMIC_NONE, - IBV_ATOMIC_HCA, - IBV_ATOMIC_GLOB -}; - -struct ibv_device_attr { - char fw_ver[64]; - uint64_t node_guid; - uint64_t sys_image_guid; - uint64_t max_mr_size; - uint64_t page_size_cap; - uint32_t vendor_id; - uint32_t vendor_part_id; - uint32_t hw_ver; - int max_qp; - int max_qp_wr; - int device_cap_flags; - int 
max_sge; - int max_sge_rd; - int max_cq; - int max_cqe; - int max_mr; - int max_pd; - int max_qp_rd_atom; - int max_ee_rd_atom; - int max_res_rd_atom; - int max_qp_init_rd_atom; - int max_ee_init_rd_atom; - enum ibv_atomic_cap atomic_cap; - int max_ee; - int max_rdd; - int max_mw; - int max_raw_ipv6_qp; - int max_raw_ethy_qp; - int max_mcast_grp; - int max_mcast_qp_attach; - int max_total_mcast_qp_attach; - int max_ah; - int max_fmr; - int max_map_per_fmr; - int max_srq; - int max_srq_wr; - int max_srq_sge; - uint16_t max_pkeys; - uint8_t local_ca_ack_delay; - uint8_t phys_port_cnt; -}; - -enum ibv_mtu { - IBV_MTU_256 = 1, - IBV_MTU_512 = 2, - IBV_MTU_1024 = 3, - IBV_MTU_2048 = 4, - IBV_MTU_4096 = 5 -}; - -enum ibv_port_state { - IBV_PORT_NOP = 0, - IBV_PORT_DOWN = 1, - IBV_PORT_INIT = 2, - IBV_PORT_ARMED = 3, - IBV_PORT_ACTIVE = 4, - IBV_PORT_ACTIVE_DEFER = 5 -}; - -enum { - IBV_LINK_LAYER_UNSPECIFIED, - IBV_LINK_LAYER_INFINIBAND, - IBV_LINK_LAYER_ETHERNET, - - /* Leave a gap for future link layer types before starting with - * experimental link layer. 
- */ - IBV_EXP_LINK_LAYER_START = 32, - IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START -}; - -enum ibv_port_cap_flags { - IBV_PORT_SM = 1 << 1, - IBV_PORT_NOTICE_SUP = 1 << 2, - IBV_PORT_TRAP_SUP = 1 << 3, - IBV_PORT_OPT_IPD_SUP = 1 << 4, - IBV_PORT_AUTO_MIGR_SUP = 1 << 5, - IBV_PORT_SL_MAP_SUP = 1 << 6, - IBV_PORT_MKEY_NVRAM = 1 << 7, - IBV_PORT_PKEY_NVRAM = 1 << 8, - IBV_PORT_LED_INFO_SUP = 1 << 9, - IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, - IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, - IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, - IBV_PORT_CM_SUP = 1 << 16, - IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, - IBV_PORT_REINIT_SUP = 1 << 18, - IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, - IBV_PORT_VENDOR_CLASS = 1 << 24, - IBV_PORT_CLIENT_REG_SUP = 1 << 25, - IBV_PORT_IP_BASED_GIDS = 1 << 26, -}; - -struct ibv_port_attr { - enum ibv_port_state state; - enum ibv_mtu max_mtu; - enum ibv_mtu active_mtu; - int gid_tbl_len; - uint32_t port_cap_flags; - uint32_t max_msg_sz; - uint32_t bad_pkey_cntr; - uint32_t qkey_viol_cntr; - uint16_t pkey_tbl_len; - uint16_t lid; - uint16_t sm_lid; - uint8_t lmc; - uint8_t max_vl_num; - uint8_t sm_sl; - uint8_t subnet_timeout; - uint8_t init_type_reply; - uint8_t active_width; - uint8_t active_speed; - uint8_t phys_state; - uint8_t link_layer; - uint8_t reserved; -}; - -enum ibv_event_type { - IBV_EVENT_CQ_ERR, - IBV_EVENT_QP_FATAL, - IBV_EVENT_QP_REQ_ERR, - IBV_EVENT_QP_ACCESS_ERR, - IBV_EVENT_COMM_EST, - IBV_EVENT_SQ_DRAINED, - IBV_EVENT_PATH_MIG, - IBV_EVENT_PATH_MIG_ERR, - IBV_EVENT_DEVICE_FATAL, - IBV_EVENT_PORT_ACTIVE, - IBV_EVENT_PORT_ERR, - IBV_EVENT_LID_CHANGE, - IBV_EVENT_PKEY_CHANGE, - IBV_EVENT_SM_CHANGE, - IBV_EVENT_SRQ_ERR, - IBV_EVENT_SRQ_LIMIT_REACHED, - IBV_EVENT_QP_LAST_WQE_REACHED, - IBV_EVENT_CLIENT_REREGISTER, - IBV_EVENT_GID_CHANGE, - - /* new experimental events start here leaving enough - * room for 14 events which should be enough - */ - IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, - IBV_EXP_EVENT_DCT_ACCESS_ERR, - 
IBV_EXP_EVENT_DCT_REQ_ERR, -}; - -struct ibv_async_event { - union { - struct ibv_cq *cq; - struct ibv_qp *qp; - struct ibv_srq *srq; - struct ibv_exp_dct *dct; - int port_num; - /* For source compatible with Legacy API */ - uint32_t xrc_qp_num; - } element; - enum ibv_event_type event_type; -}; - -enum ibv_wc_status { - IBV_WC_SUCCESS, - IBV_WC_LOC_LEN_ERR, - IBV_WC_LOC_QP_OP_ERR, - IBV_WC_LOC_EEC_OP_ERR, - IBV_WC_LOC_PROT_ERR, - IBV_WC_WR_FLUSH_ERR, - IBV_WC_MW_BIND_ERR, - IBV_WC_BAD_RESP_ERR, - IBV_WC_LOC_ACCESS_ERR, - IBV_WC_REM_INV_REQ_ERR, - IBV_WC_REM_ACCESS_ERR, - IBV_WC_REM_OP_ERR, - IBV_WC_RETRY_EXC_ERR, - IBV_WC_RNR_RETRY_EXC_ERR, - IBV_WC_LOC_RDD_VIOL_ERR, - IBV_WC_REM_INV_RD_REQ_ERR, - IBV_WC_REM_ABORT_ERR, - IBV_WC_INV_EECN_ERR, - IBV_WC_INV_EEC_STATE_ERR, - IBV_WC_FATAL_ERR, - IBV_WC_RESP_TIMEOUT_ERR, - IBV_WC_GENERAL_ERR -}; -const char *ibv_wc_status_str(enum ibv_wc_status status); - -enum ibv_wc_opcode { - IBV_WC_SEND, - IBV_WC_RDMA_WRITE, - IBV_WC_RDMA_READ, - IBV_WC_COMP_SWAP, - IBV_WC_FETCH_ADD, - IBV_WC_BIND_MW, -/* - * Set value of IBV_WC_RECV so consumers can test if a completion is a - * receive by testing (opcode & IBV_WC_RECV). 
- */ - IBV_WC_RECV = 1 << 7, - IBV_WC_RECV_RDMA_WITH_IMM -}; - -enum ibv_wc_flags { - IBV_WC_GRH = 1 << 0, - IBV_WC_WITH_IMM = 1 << 1 -}; - -struct ibv_wc { - uint64_t wr_id; - enum ibv_wc_status status; - enum ibv_wc_opcode opcode; - uint32_t vendor_err; - uint32_t byte_len; - uint32_t imm_data; /* in network byte order */ - uint32_t qp_num; - uint32_t src_qp; - int wc_flags; - uint16_t pkey_index; - uint16_t slid; - uint8_t sl; - uint8_t dlid_path_bits; -}; - -enum ibv_access_flags { - IBV_ACCESS_LOCAL_WRITE = 1, - IBV_ACCESS_REMOTE_WRITE = (1<<1), - IBV_ACCESS_REMOTE_READ = (1<<2), - IBV_ACCESS_REMOTE_ATOMIC = (1<<3), - IBV_ACCESS_MW_BIND = (1<<4), - IBV_ACCESS_RELAXED_ORDERING = (1<<20), -}; - -struct ibv_pd { - struct ibv_context *context; - uint32_t handle; -}; - -enum ibv_xrcd_init_attr_mask { - IBV_XRCD_INIT_ATTR_FD = 1 << 0, - IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, - IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 -}; - -struct ibv_xrcd_init_attr { - uint32_t comp_mask; - int fd; - int oflags; -}; - -struct ibv_xrcd { - struct ibv_context *context; -}; - -enum ibv_rereg_mr_flags { - IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), - IBV_REREG_MR_CHANGE_PD = (1 << 1), - IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), - IBV_REREG_MR_KEEP_VALID = (1 << 3) -}; - -struct ibv_mr { - struct ibv_context *context; - struct ibv_pd *pd; - void *addr; - size_t length; - uint32_t handle; - uint32_t lkey; - uint32_t rkey; -}; - -enum ibv_mw_type { - IBV_MW_TYPE_1 = 1, - IBV_MW_TYPE_2 = 2 -}; - -struct ibv_mw { - struct ibv_context *context; - struct ibv_pd *pd; - uint32_t rkey; -}; - -struct ibv_global_route { - union ibv_gid dgid; - uint32_t flow_label; - uint8_t sgid_index; - uint8_t hop_limit; - uint8_t traffic_class; -}; - -struct ibv_grh { - uint32_t version_tclass_flow; - uint16_t paylen; - uint8_t next_hdr; - uint8_t hop_limit; - union ibv_gid sgid; - union ibv_gid dgid; -}; - -enum ibv_rate { - IBV_RATE_MAX = 0, - IBV_RATE_2_5_GBPS = 2, - IBV_RATE_5_GBPS = 5, - IBV_RATE_10_GBPS = 3, - 
IBV_RATE_20_GBPS = 6, - IBV_RATE_30_GBPS = 4, - IBV_RATE_40_GBPS = 7, - IBV_RATE_60_GBPS = 8, - IBV_RATE_80_GBPS = 9, - IBV_RATE_120_GBPS = 10, - IBV_RATE_14_GBPS = 11, - IBV_RATE_56_GBPS = 12, - IBV_RATE_112_GBPS = 13, - IBV_RATE_168_GBPS = 14, - IBV_RATE_25_GBPS = 15, - IBV_RATE_100_GBPS = 16, - IBV_RATE_200_GBPS = 17, - IBV_RATE_300_GBPS = 18 -}; - -/** - * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the - * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be - * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. - * @rate: rate to convert. - */ -int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; - -/** - * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. - * @mult: multiple to convert. - */ -enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; - -/** - * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. - * For example, IBV_RATE_5_GBPS will return the value 5000. - * @rate: rate to convert. - */ -int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; - -/** - * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. - * @mbps: value to convert. 
- */ -enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; - -struct ibv_ah_attr { - struct ibv_global_route grh; - uint16_t dlid; - uint8_t sl; - uint8_t src_path_bits; - uint8_t static_rate; - uint8_t is_global; - uint8_t port_num; -}; - -enum ibv_srq_attr_mask { - IBV_SRQ_MAX_WR = 1 << 0, - IBV_SRQ_LIMIT = 1 << 1 -}; - -struct ibv_srq_attr { - uint32_t max_wr; - uint32_t max_sge; - uint32_t srq_limit; -}; - -struct ibv_srq_init_attr { - void *srq_context; - struct ibv_srq_attr attr; -}; - -enum ibv_srq_type { - IBV_SRQT_BASIC, - IBV_SRQT_XRC -}; - -enum ibv_srq_init_attr_mask { - IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, - IBV_SRQ_INIT_ATTR_PD = 1 << 1, - IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, - IBV_SRQ_INIT_ATTR_CQ = 1 << 3, - IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 -}; - -struct ibv_srq_init_attr_ex { - void *srq_context; - struct ibv_srq_attr attr; - - uint32_t comp_mask; - enum ibv_srq_type srq_type; - struct ibv_pd *pd; - struct ibv_xrcd *xrcd; - struct ibv_cq *cq; -}; - -enum ibv_qp_type { - IBV_QPT_RC = 2, - IBV_QPT_UC, - IBV_QPT_UD, - /* XRC compatible code */ - IBV_QPT_XRC, - IBV_QPT_RAW_PACKET = 8, - IBV_QPT_RAW_ETH = 8, - IBV_QPT_XRC_SEND = 9, - IBV_QPT_XRC_RECV, - - /* Leave a gap for future qp types before starting with - * experimental qp types. 
- */ - IBV_EXP_QP_TYPE_START = 32, - IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START -}; - -struct ibv_qp_cap { - uint32_t max_send_wr; - uint32_t max_recv_wr; - uint32_t max_send_sge; - uint32_t max_recv_sge; - uint32_t max_inline_data; -}; - -struct ibv_qp_init_attr { - void *qp_context; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - struct ibv_qp_cap cap; - enum ibv_qp_type qp_type; - int sq_sig_all; - /* Below is needed for backwards compatabile */ - struct ibv_xrc_domain *xrc_domain; -}; - -enum ibv_qp_init_attr_mask { - IBV_QP_INIT_ATTR_PD = 1 << 0, - IBV_QP_INIT_ATTR_XRCD = 1 << 1, - IBV_QP_INIT_ATTR_RESERVED = 1 << 2 -}; - -struct ibv_qp_init_attr_ex { - void *qp_context; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - struct ibv_qp_cap cap; - enum ibv_qp_type qp_type; - int sq_sig_all; - - uint32_t comp_mask; - struct ibv_pd *pd; - struct ibv_xrcd *xrcd; -}; - -enum ibv_qp_open_attr_mask { - IBV_QP_OPEN_ATTR_NUM = 1 << 0, - IBV_QP_OPEN_ATTR_XRCD = 1 << 1, - IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, - IBV_QP_OPEN_ATTR_TYPE = 1 << 3, - IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 -}; - -struct ibv_qp_open_attr { - uint32_t comp_mask; - uint32_t qp_num; - struct ibv_xrcd *xrcd; - void *qp_context; - enum ibv_qp_type qp_type; -}; - -enum ibv_qp_attr_mask { - IBV_QP_STATE = 1 << 0, - IBV_QP_CUR_STATE = 1 << 1, - IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, - IBV_QP_ACCESS_FLAGS = 1 << 3, - IBV_QP_PKEY_INDEX = 1 << 4, - IBV_QP_PORT = 1 << 5, - IBV_QP_QKEY = 1 << 6, - IBV_QP_AV = 1 << 7, - IBV_QP_PATH_MTU = 1 << 8, - IBV_QP_TIMEOUT = 1 << 9, - IBV_QP_RETRY_CNT = 1 << 10, - IBV_QP_RNR_RETRY = 1 << 11, - IBV_QP_RQ_PSN = 1 << 12, - IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, - IBV_QP_ALT_PATH = 1 << 14, - IBV_QP_MIN_RNR_TIMER = 1 << 15, - IBV_QP_SQ_PSN = 1 << 16, - IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, - IBV_QP_PATH_MIG_STATE = 1 << 18, - IBV_QP_CAP = 1 << 19, - IBV_QP_DEST_QPN = 1 << 20 -}; - -enum ibv_qp_state { - IBV_QPS_RESET, - IBV_QPS_INIT, 
- IBV_QPS_RTR, - IBV_QPS_RTS, - IBV_QPS_SQD, - IBV_QPS_SQE, - IBV_QPS_ERR, - IBV_QPS_UNKNOWN -}; - -enum ibv_mig_state { - IBV_MIG_MIGRATED, - IBV_MIG_REARM, - IBV_MIG_ARMED -}; - -struct ibv_qp_attr { - enum ibv_qp_state qp_state; - enum ibv_qp_state cur_qp_state; - enum ibv_mtu path_mtu; - enum ibv_mig_state path_mig_state; - uint32_t qkey; - uint32_t rq_psn; - uint32_t sq_psn; - uint32_t dest_qp_num; - int qp_access_flags; - struct ibv_qp_cap cap; - struct ibv_ah_attr ah_attr; - struct ibv_ah_attr alt_ah_attr; - uint16_t pkey_index; - uint16_t alt_pkey_index; - uint8_t en_sqd_async_notify; - uint8_t sq_draining; - uint8_t max_rd_atomic; - uint8_t max_dest_rd_atomic; - uint8_t min_rnr_timer; - uint8_t port_num; - uint8_t timeout; - uint8_t retry_cnt; - uint8_t rnr_retry; - uint8_t alt_port_num; - uint8_t alt_timeout; -}; - -enum ibv_wr_opcode { - IBV_WR_RDMA_WRITE, - IBV_WR_RDMA_WRITE_WITH_IMM, - IBV_WR_SEND, - IBV_WR_SEND_WITH_IMM, - IBV_WR_RDMA_READ, - IBV_WR_ATOMIC_CMP_AND_SWP, - IBV_WR_ATOMIC_FETCH_AND_ADD -}; - -enum ibv_send_flags { - IBV_SEND_FENCE = 1 << 0, - IBV_SEND_SIGNALED = 1 << 1, - IBV_SEND_SOLICITED = 1 << 2, - IBV_SEND_INLINE = 1 << 3 -}; - -struct ibv_sge { - uint64_t addr; - uint32_t length; - uint32_t lkey; -}; - -struct ibv_send_wr { - uint64_t wr_id; - struct ibv_send_wr *next; - struct ibv_sge *sg_list; - int num_sge; - enum ibv_wr_opcode opcode; - int send_flags; - uint32_t imm_data; /* in network byte order */ - union { - struct { - uint64_t remote_addr; - uint32_t rkey; - } rdma; - struct { - uint64_t remote_addr; - uint64_t compare_add; - uint64_t swap; - uint32_t rkey; - } atomic; - struct { - struct ibv_ah *ah; - uint32_t remote_qpn; - uint32_t remote_qkey; - } ud; - } wr; - union { - union { - struct { - uint32_t remote_srqn; - } xrc; - } qp_type; - - uint32_t xrc_remote_srq_num; - }; -}; - -struct ibv_recv_wr { - uint64_t wr_id; - struct ibv_recv_wr *next; - struct ibv_sge *sg_list; - int num_sge; -}; - -struct ibv_mw_bind { - 
uint64_t wr_id; - struct ibv_mr *mr; - void *addr; - size_t length; - int send_flags; - int mw_access_flags; -}; - -struct ibv_srq { - struct ibv_context *context; - void *srq_context; - struct ibv_pd *pd; - uint32_t handle; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; - - /* below are for source compatabilty with legacy XRC, - * padding based on ibv_srq_legacy. - */ - uint32_t xrc_srq_num_bin_compat_padding; - struct ibv_xrc_domain *xrc_domain_bin_compat_padding; - struct ibv_cq *xrc_cq_bin_compat_padding; - void *ibv_srq_padding; - - /* legacy fields */ - uint32_t xrc_srq_num; - struct ibv_xrc_domain *xrc_domain; - struct ibv_cq *xrc_cq; -}; - -/* Not in use in new API, needed for compilation as part of source compat layer */ -enum ibv_event_flags { - IBV_XRC_QP_EVENT_FLAG = 0x80000000, -}; - - - -struct ibv_qp { - struct ibv_context *context; - void *qp_context; - struct ibv_pd *pd; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - uint32_t handle; - uint32_t qp_num; - enum ibv_qp_state state; - enum ibv_qp_type qp_type; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; -}; - -struct ibv_comp_channel { - struct ibv_context *context; - int fd; - int refcnt; -}; - -struct ibv_cq { - struct ibv_context *context; - struct ibv_comp_channel *channel; - void *cq_context; - uint32_t handle; - int cqe; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t comp_events_completed; - uint32_t async_events_completed; -}; - -struct ibv_ah { - struct ibv_context *context; - struct ibv_pd *pd; - uint32_t handle; -}; - -enum ibv_flow_flags { - IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, - IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, -}; - -enum ibv_flow_attr_type { - /* steering according to rule specifications */ - IBV_FLOW_ATTR_NORMAL = 0x0, - /* default unicast and multicast rule - - * receive all Eth traffic which isn't steered to any QP - */ - IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, - /* 
default multicast rule - - * receive all Eth multicast traffic which isn't steered to any QP - */ - IBV_FLOW_ATTR_MC_DEFAULT = 0x2, -}; - -enum ibv_flow_spec_type { - IBV_FLOW_SPEC_ETH = 0x20, - IBV_FLOW_SPEC_IPV4 = 0x30, - IBV_FLOW_SPEC_TCP = 0x40, - IBV_FLOW_SPEC_UDP = 0x41, -}; - -struct ibv_flow_eth_filter { - uint8_t dst_mac[6]; - uint8_t src_mac[6]; - uint16_t ether_type; - /* - * same layout as 802.1q: prio 3, cfi 1, vlan id 12 - */ - uint16_t vlan_tag; -}; - -struct ibv_flow_spec_eth { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_eth_filter val; - struct ibv_flow_eth_filter mask; -}; - -struct ibv_flow_ipv4_filter { - uint32_t src_ip; - uint32_t dst_ip; -}; - -struct ibv_flow_spec_ipv4 { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_ipv4_filter val; - struct ibv_flow_ipv4_filter mask; -}; - -struct ibv_flow_tcp_udp_filter { - uint16_t dst_port; - uint16_t src_port; -}; - -struct ibv_flow_spec_tcp_udp { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_tcp_udp_filter val; - struct ibv_flow_tcp_udp_filter mask; -}; - -struct ibv_flow_spec { - union { - struct { - enum ibv_flow_spec_type type; - uint16_t size; - } hdr; - struct ibv_flow_spec_eth eth; - struct ibv_flow_spec_ipv4 ipv4; - struct ibv_flow_spec_tcp_udp tcp_udp; - }; -}; - -struct ibv_flow_attr { - uint32_t comp_mask; - enum ibv_flow_attr_type type; - uint16_t size; - uint16_t priority; - uint8_t num_of_specs; - uint8_t port; - uint32_t flags; - /* Following are the optional layers according to user request - * struct ibv_flow_spec_xxx [L2] - * struct ibv_flow_spec_yyy [L3/L4] - */ -}; - -struct ibv_flow { - uint32_t comp_mask; - struct ibv_context *context; - uint32_t handle; -}; - -struct ibv_device; -struct ibv_context; - -struct ibv_device_ops { - struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); - void (*free_context)(struct ibv_context *context); -}; - -enum { - IBV_SYSFS_NAME_MAX = 64, - IBV_SYSFS_PATH_MAX 
= 256 -}; - -struct ibv_device { - struct ibv_device_ops ops; - enum ibv_node_type node_type; - enum ibv_transport_type transport_type; - /* Name of underlying kernel IB device, eg "mthca0" */ - char name[IBV_SYSFS_NAME_MAX]; - /* Name of uverbs device, eg "uverbs0" */ - char dev_name[IBV_SYSFS_NAME_MAX]; - /* Path to infiniband_verbs class device in sysfs */ - char dev_path[IBV_SYSFS_PATH_MAX]; - /* Path to infiniband class device in sysfs */ - char ibdev_path[IBV_SYSFS_PATH_MAX]; -}; - -struct verbs_device { - struct ibv_device device; /* Must be first */ - size_t sz; - size_t size_of_context; - int (*init_context)(struct verbs_device *device, - struct ibv_context *ctx, int cmd_fd); - void (*uninit_context)(struct verbs_device *device, - struct ibv_context *ctx); - /* future fields added here */ -}; - -struct ibv_context_ops { - int (*query_device)(struct ibv_context *context, - struct ibv_device_attr *device_attr); - int (*query_port)(struct ibv_context *context, uint8_t port_num, - struct ibv_port_attr *port_attr); - struct ibv_pd * (*alloc_pd)(struct ibv_context *context); - int (*dealloc_pd)(struct ibv_pd *pd); - struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, - int access); - struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, - int flags, - struct ibv_pd *pd, void *addr, - size_t length, - int access); - int (*dereg_mr)(struct ibv_mr *mr); - struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); - int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, - struct ibv_mw_bind *mw_bind); - int (*dealloc_mw)(struct ibv_mw *mw); - struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector); - int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); - int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); - void (*cq_event)(struct ibv_cq *cq); - int (*resize_cq)(struct ibv_cq *cq, int cqe); - int (*destroy_cq)(struct ibv_cq *cq); - struct ibv_srq * 
(*create_srq)(struct ibv_pd *pd, - struct ibv_srq_init_attr *srq_init_attr); - int (*modify_srq)(struct ibv_srq *srq, - struct ibv_srq_attr *srq_attr, - int srq_attr_mask); - int (*query_srq)(struct ibv_srq *srq, - struct ibv_srq_attr *srq_attr); - int (*destroy_srq)(struct ibv_srq *srq); - int (*post_srq_recv)(struct ibv_srq *srq, - struct ibv_recv_wr *recv_wr, - struct ibv_recv_wr **bad_recv_wr); - struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); - int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr); - int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, - int attr_mask); - int (*destroy_qp)(struct ibv_qp *qp); - int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr); - int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, - struct ibv_recv_wr **bad_wr); - struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); - int (*destroy_ah)(struct ibv_ah *ah); - int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, - uint16_t lid); - int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, - uint16_t lid); - void (*async_event)(struct ibv_async_event *event); -}; - -struct ibv_context { - struct ibv_device *device; - struct ibv_context_ops ops; - int cmd_fd; - int async_fd; - int num_comp_vectors; - pthread_mutex_t mutex; - void *abi_compat; -}; - -enum verbs_context_mask { - VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, - VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, - VERBS_CONTEXT_QP = (uint64_t)1 << 2, - VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, - VERBS_CONTEXT_EXP = (uint64_t)1 << 62 -}; - -struct verbs_context { - /* "grows up" - new fields go here */ - int (*_reserved_2) (void); - int (*destroy_flow) (struct ibv_flow *flow); - int (*_reserved_1) (void); - struct ibv_flow * (*create_flow) (struct ibv_qp *qp, - struct ibv_flow_attr *flow_attr); - struct ibv_qp * (*open_qp)(struct ibv_context *context, 
- struct ibv_qp_open_attr *attr); - struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, - struct ibv_qp_init_attr_ex *qp_init_attr_ex); - int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); - struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, - struct ibv_srq_init_attr_ex *srq_init_attr_ex); - struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, - struct ibv_xrcd_init_attr *xrcd_init_attr); - int (*close_xrcd)(struct ibv_xrcd *xrcd); - uint64_t has_comp_mask; - size_t sz; /* Must be immediately before struct ibv_context */ - struct ibv_context context;/* Must be last field in the struct */ -}; - -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) -{ - return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? - NULL : container_of(ctx, struct verbs_context, context); -} - -#define verbs_get_ctx_op(ctx, op) ({ \ - struct verbs_context *_vctx = verbs_get_ctx(ctx); \ - (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ - !_vctx->op) ? NULL : _vctx; })*/ - -#define verbs_set_ctx_op(_vctx, op, ptr) ({ \ - struct verbs_context *vctx = _vctx; \ - if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ - vctx->op = ptr; }) - -static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) -{ - return (dev->ops.alloc_context) ? 
- NULL : container_of(dev, struct verbs_device, device); -} - typedef enum ibv_return_enum { IBV_SUCCESS = 0, //!< The operation was successful } ibv_return_t; ncclResult_t wrap_ibv_symbols(void); +/* NCCL wrappers of IB verbs functions */ ncclResult_t wrap_ibv_fork_init(void); ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); @@ -1087,9 +66,6 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); -static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { - return qp->context->ops.post_send(qp, wr, bad_wr); -} static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/include/info.h b/src/include/info.h index 193d820f51..ab07b5e814 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -26,6 +26,7 @@ typedef enum : uint8_t { ncclPatternCollnetChain, ncclPatternCollnetDirect, ncclPatternNvls, + ncclPatternNvlsTree, ncclPatternSend, ncclPatternRecv } ncclPattern_t; @@ -94,7 +95,6 @@ struct ncclCudaStreamList { struct ncclCudaStreamList *next; cudaStream_t stream; }; - struct ncclTasks { struct Peer { bool sendSeen, recvSeen; @@ -104,7 +104,8 @@ struct ncclTasks { struct ncclIntruQueue collQueue; size_t collBytesTotal; struct Peer* peers/*[nRanks]*/; - int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/; + int *p2pSendOrder, *p2pRecvOrder; + int p2pOrderSteps; int nTasksColl, nTasksP2p; // The list of user 
streams aggregated over all tasks present. diff --git a/src/include/net.h b/src/include/net.h index 5a7b5e3a74..b5df589683 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -18,25 +18,6 @@ ncclResult_t ncclNetPluginInit(); ncclResult_t ncclNetInit(struct ncclComm* comm); int ncclNetVersion(struct ncclComm* comm); -// Translation to external API -static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; } -static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } -static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; } -/* DMA-BUF support */ -static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } -static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; } -static ncclResult_t 
ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } -static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } -static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; } -static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; } -static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; } - // Test whether the current GPU support GPU Direct RDMA. 
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/nvtx3/nvtx3.hpp b/src/include/nvtx3/nvtx3.hpp index cb0ef6858f..8c62acd469 100644 --- a/src/include/nvtx3/nvtx3.hpp +++ b/src/include/nvtx3/nvtx3.hpp @@ -126,7 +126,7 @@ * Systems: * * \image html - * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png + * https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png * * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add * ranges to your code that automatically use the name of the enclosing function @@ -561,18 +561,27 @@ /* Temporary helper #defines, removed with #undef at end of header */ -#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET) -#if defined(_MSC_VER) && _MSC_VER < 1914 -/* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser - * that does not work with domain::get's specialization for domain::global, - * and would require extra conditions to make SFINAE work for the overloaded - * get() functions. This macro disables use of overloaded get() in order to - * work with VS2015 and versions of VS2017 below 15.7, without penalizing - * users of newer compilers. Building with this flag set to 0 means errors - * when defining tag structs (see documentation for domain, named_category, - * and registered_string) will have more complex compiler error messages - * instead of the clear static_assert messages from the get() overloads. +/* Some compilers do not correctly support SFINAE, which is used in this API + * to detect common usage errors and provide clearer error messages (by using + * static_assert) than the compiler would produce otherwise. 
These compilers + * will generate errors while compiling this file such as: + * + * error: ‘name’ is not a member of ‘nvtx3::v1::domain::global’ + * + * The following compiler versions are known to have this problem, and so are + * set by default to disable the SFINAE-based checks: + * + * - All MSVC versions prior to VS2017 Update 7 (15.7) + * - GCC 8.1-8.3 (the problem was fixed in GCC 8.4) + * + * If you find your compiler hits this problem, you can work around it by + * defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this + * header, or you can add a check for your compiler version to this #if. + * Also, please report the issue on the NVTX github page. */ +#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET) +#if defined(_MSC_VER) && _MSC_VER < 1914 \ + || defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4 #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0 #else #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1 diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h b/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h index c2c1ac596f..7c166bd34b 100644 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h +++ b/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h @@ -1,30 +1,33 @@ +/* +* Copyright 2021-2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). #endif -/* - * Helper array to get the alignment for each predefined C language type. 
- */ - typedef void* pointer_type; -#if __STDC_VERSION__ >= 201112L /* or CPP11 */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +#include #include +#endif + +/* `alignof` is available as of C11 or C++11 */ +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L) + #define nvtx_alignof(type) alignof(type) #define nvtx_alignof2(type,tname) alignof(type) -#else /* __STDC_VERSION__ >= 201112L */ -#ifndef __cplusplus -#include -#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d) -#define nvtx_alignof2(type,tname) nvtx_alignof(type) +#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */ -#else /* __cplusplus */ - -#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE -#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME -#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d) -#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d) +/* Create helper structs to determine type alignment. 
*/ +#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type +#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname MKTYPEDEF(char); MKTYPEDEF2(unsigned char, uchar); @@ -54,22 +57,33 @@ MKTYPEDEF(size_t); MKTYPEDEF(pointer_type); MKTYPEDEF(wchar_t); -#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) - {sizeof(char8_t), nvtx_alignof(char8_t)}, + +/* `char8_t` is available as of C++20 or C23 */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) MKTYPEDEF(char8_t); #endif -#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L) + +/* `char16_t` and `char32_t` are available as of C++11 or C11 */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) MKTYPEDEF(char16_t); MKTYPEDEF(char32_t); #endif +/* C requires to include stddef.h to use `offsetof` */ +#ifndef __cplusplus +#include +#endif + +#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d) +#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d) + +#endif /* __STDC_VERSION__ >= 201112L */ + #undef MKTYPEDEF #undef MKTYPEDEF2 -#endif /* __cplusplus */ -#endif /* __STDC_VERSION__ >= 201112L */ - /* + * Helper array to get the alignment for each predefined C/C++ language type. * The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`. 
*/ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = @@ -109,13 +123,14 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_ /*** Special character types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)}, - /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ -#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) - {sizeof(char8_t), nvtx_alignof(char8_t)}, + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)}, #else - {0, 0}, + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0}, #endif -#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L) + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)} #else @@ -125,4 +140,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_ }; #undef nvtx_alignof -#undef nvtx_alignof2 \ No newline at end of file +#undef nvtx_alignof2 diff --git a/src/include/p2p.h b/src/include/p2p.h index 69d1ea77c1..426a15017a 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -9,4 +9,21 @@ #ifndef NCCL_P2P_H_ #define NCCL_P2P_H_ +#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + +typedef struct { + int data; // Currently only support an fd based descriptor +} ncclCuDesc; + +typedef union { + // Legacy CUDA IPC + cudaIpcMemHandle_t devIpc; + // cuMem API support + ncclCuDesc cuDesc; +} ncclIpcDesc; + +ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); +ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); +ncclResult_t 
ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); + #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index 83b8937861..17db4bcef0 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -15,11 +15,13 @@ #include "ipcsocket.h" #include #include "shm.h" +#include "p2p.h" enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; +enum { proxyRecv=0, proxySend=1 }; struct ncclProxyArgs; -typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*); +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); #define NCCL_PROXY_MAX_SUBS MAXCHANNELS static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); @@ -130,18 +132,11 @@ struct ncclProxySharedP2p { int size; char* cudaBuff; char* hostBuff; - cudaIpcMemHandle_t ipc; + // CUDA IPC + ncclIpcDesc ipcDesc; struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv }; -struct ncclProxySharedCollNet { - int size; - char* cudaBuff; - char* hostBuff; - struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; - void* resources; -}; - struct ncclProxyPeer { struct ncclProxySharedP2p send; struct ncclProxySharedP2p recv; @@ -165,7 +160,6 @@ struct ncclProxyProgressState { bool stop; struct ncclProxyPeer** localPeers; struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; - struct ncclProxySharedCollNet collNet; struct ncclProxyArgs* active; struct ncclProxyArgs* pool; struct ncclProxyPool* pools; @@ -192,12 +186,27 @@ struct ncclProxyAsyncOp { struct ncclProxyLocalPeer { struct ncclSocket sock; - int localRank; + int tpRank; + int tpLocalRank; ncclProxyAsyncOp* asyncOps; int asyncOpCounter; }; struct ncclProxyState { + int refCount; + int tpRank; + int tpnRanks; + int tpLocalnRanks; + int cudaDev; + int p2pnChannels; + int p2pChunkSize; + int nChannels; + int buffSizes[NCCL_NUM_PROTOCOLS]; + bool 
allocP2pNetLLBuffers; + bool dmaBufSupport; + ncclNet_t* ncclNet; + ncclCollNet_t* ncclCollNet; + volatile uint32_t* abortFlag; // Service thread pthread_t thread; struct ncclSocket* listenSock; @@ -209,6 +218,7 @@ struct ncclProxyState { struct ncclSocket* peerSocks; struct ncclProxyOps* proxyOps; void** sharedDevMems; + struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS) // Progress thread struct ncclProxyProgressState progressState; @@ -228,13 +238,14 @@ enum proxyConnectState { struct ncclProxyConnection { int send, transport, shared; - int localRank; + int tpLocalRank, sameProcess; struct ncclSocket* sock; struct ncclTransportComm* tcomm; struct ncclProxyArgs *proxyAppend; struct ncclProxyArgs **proxyAppendPtr; void* transportResources; proxyConnectState state; + struct ncclCollNetSharedRes* collNet; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); @@ -250,7 +261,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* prox ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); ncclResult_t ncclProxyCreate(struct ncclComm* comm); -ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn); +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); enum ncclProxyMsgType { ncclProxyMsgInit = 1, ncclProxyMsgSharedInit = 2, @@ -260,22 +271,24 @@ enum ncclProxyMsgType { ncclProxyMsgClose = 6, ncclProxyMsgAbort = 7, ncclProxyMsgStop = 8, - ncclProxyMsgConvertFd = 9 // cuMem API support + ncclProxyMsgConvertFd = 9, // cuMem API support (UDS) }; // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types // Call this function on the client, supplying a locally unique opId. 
Then, poll on the return value of // ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed -ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); +ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); // This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received -ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); -ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); +ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); +ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); -ncclResult_t ncclProxyDestroy(struct ncclComm* comm); +ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd); + +ncclResult_t ncclProxyStop(struct ncclComm* comm); ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); +ncclResult_t ncclProxyDestroy(struct ncclComm* comm); -enum { proxyRecv=0, proxySend=1 }; -ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex); +ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex); #endif diff --git a/src/include/transport.h b/src/include/transport.h index 01812af1a3..f3f47065a2 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -36,7 +36,6 @@ struct ncclComm; struct ncclPeerInfo { int rank; int cudaDev; - int netDev; 
int gdrSupport; bool hasFineGrain; uint64_t hostHash; @@ -45,7 +44,6 @@ struct ncclPeerInfo { int64_t busId; struct ncclComm* comm; int cudaCompCap; - int virtualId; }; #define CONNECT_SIZE 128 @@ -53,15 +51,46 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; +#if CUDART_VERSION >= 12010 + +#define NVLS_HANDLE_SIZE 64 +struct ncclNvlsSharedRes { + int refCount; + CUmulticastObjectProp properties; + CUmemAccessDesc accessDesc; + int dev; + size_t size; + size_t granularity; + CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer + char* mcBuff; // Multicast NVLS buffer address + CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer + char* ucBuff; // Unicast NVLS buffer address + char shareableHandle[NVLS_HANDLE_SIZE]; + int nChannels; +}; + +#endif /* CUDART_VERSION >= 12010 */ + +struct ncclCollNetSharedRes { + int refCount; + int size; + char* cudaBuff; + char* hostBuff; + struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; + void* resources; + int nChannels; + size_t buffSize; +}; + struct ncclTransportComm { ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); ncclResult_t (*free)(struct ncclConnector*); - ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels); - ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); - ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); - ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm); - ncclResult_t (*proxyProgress)(struct 
ncclComm* comm, struct ncclProxyArgs*); + ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels); + ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState); + ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); }; struct ncclTransport { @@ -74,10 +103,9 @@ struct ncclTransport { ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); -#if CUDART_VERSION >= 12010 -ncclResult_t ncclNvlsSetup(struct ncclComm* comm); +ncclResult_t ncclNvlsInit(struct ncclComm* comm); +ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); ncclResult_t ncclNvlsFree(struct ncclComm* comm); -#endif enum { collNetRecv=0, collNetSend=1 }; int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); diff --git a/src/init.cc b/src/init.cc index 9b035f3faa..9778cb7bda 100644 --- a/src/init.cc +++ b/src/init.cc @@ -53,7 +53,7 @@ #endif const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAllPivot" }; -const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS" }; +const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { 
"Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; const char* ncclDevRedOpStr[ncclNumDevRedOps] = { "Sum", "Prod", "Max", "Min", "PreMulSum", "SumPostDiv" }; const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "_u64", "_f16", "_f32", "_f64", "_b16"}; @@ -64,6 +64,7 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT); struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {}; +static ncclResult_t commReclaim(ncclComm_t comm); static uint64_t hashUniqueId(ncclUniqueId const &id) { char const *bytes = (char const*)&id; @@ -284,7 +285,7 @@ void ncclCommPushFree(struct ncclComm* comm, void* obj) { } static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) { - CUDACHECK(cudaFree(dtor->obj)); + NCCLCHECK(ncclCudaFree(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) { @@ -327,8 +328,9 @@ static ncclResult_t commFree(ncclComm_t comm) { /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will * free all intra-process communicators; therefore, we only need to focus on local * resource cleanup in commFree(). 
*/ - if (comm->proxyState.thread) - pthread_join(comm->proxyState.thread, nullptr); + if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) { + pthread_join(comm->proxyState->thread, nullptr); + } delete[] comm->userRedOps; @@ -368,19 +370,29 @@ static ncclResult_t commFree(ncclComm_t comm) { } free(comm->rankToNode); free(comm->rankToLocalRank); + free(comm->collNetHeads); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); for (int channel=0; channelchannels+channel, comm->nRanks)); + NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks, 1, comm->localRanks)); if (comm->doneEvent != NULL) CUDACHECK(hipEventDestroy(comm->doneEvent)); - if (comm->initState == ncclSuccess) { - NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream)); - NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream)); + if (comm->sharedRes) { + if (ncclAtomicRefCountDecrement(&comm->sharedRes->refCount) == 0) { + for (int c=0; csharedRes->peers[c]) free(comm->sharedRes->peers[c]); + if (comm->sharedRes->devPeers[c]) ncclCudaFree(comm->sharedRes->devPeers[c]); + } + free(comm->sharedRes->tpRankToLocalRank); + NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); + NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); + NCCLCHECK(ncclProxyDestroy(comm)); + free(comm->sharedRes); + } } #if CUDART_VERSION >= 12010 @@ -397,8 +409,14 @@ static ncclResult_t commFree(ncclComm_t comm) { ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); - ncclCudaHostFree((void *)comm->abortFlag); - free(comm->netName); + if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) { + NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag)); + free(comm->abortFlagRefCount); + } + free((void*)comm->config.netName); + + free(comm->topParentRanks); + free(comm->topParentLocalRanks); commPoison(comm); // poison comm before free to avoid comm reuse. 
free(comm); @@ -460,7 +478,7 @@ exit: return ret; } -static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtualId) { +static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) { if (ndev < 1) { WARN("invalid device count (%d) requested", ndev); return ncclInvalidArgument; @@ -470,20 +488,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua return ncclInvalidArgument; } - struct ncclComm* comm; - /* Cuurently we calloc comm in ncclCommInitRankDev for async function support. - * This 'if' structure is designed to consider the case where commAlloc is called - * in other cases except ncclCommInitRankDev. */ - if (*comret == NULL) { - /* user requests a new communicator */ - NCCLCHECK(ncclCalloc(&comm, 1)); - NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1)); - NCCLCHECK(ncclCommSetAsyncError(comm, ncclInProgress)); - } else { - /* We already allocated a communicator in ncclCommInitRankDev. */ - comm = *comret; - } - ncclMemoryStackConstruct(&comm->memPermanent); ncclMemoryStackConstruct(&comm->memScoped); comm->destructorHead = nullptr; @@ -491,8 +495,14 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua comm->nRanks = ndev; NCCLCHECK(ncclNetInit(comm)); - INFO(NCCL_INIT, "Using network %s", ncclNetName(comm)); + INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); + if (parent && parent->config.splitShare) { + if (parent->ncclNet != comm->ncclNet) { + WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name); + return ncclInvalidUsage; + } + } // Try to create a CUDA object right away. If there is something wrong with // the device we're on (failure cause #1) , better know it early. 
hipEvent_t doneEvent; @@ -502,13 +512,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming)); #endif - NCCLCHECK(ncclStrongStreamConstruct(&comm->deviceStream)); - NCCLCHECK(ncclStrongStreamConstruct(&comm->hostStream)); comm->doneEvent = doneEvent; comm->lastStream = nullptr; - comm->virtualId = virtualId; - cudaGetDevice(&comm->cudaDev); + CUDACHECK(cudaGetDevice(&comm->cudaDev)); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); comm->compCap = ncclCudaCompCap(); TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap); @@ -529,6 +537,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua comm->collTraceThread = 0; #endif comm->collNetSupport = 0; + memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix)); ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); @@ -546,10 +555,30 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua // Mark channels as non initialized. for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1; - ncclIntruQueueMpscConstruct(&comm->callbackQueue); + if (parent == NULL || !parent->config.splitShare) { + struct ncclSharedResources* sharedRes = NULL; + NCCLCHECK(ncclCalloc(&sharedRes, 1)); + /* most of attributes are assigned later in initTransportsRank(). 
*/ + sharedRes->owner = comm; + sharedRes->tpNRanks = comm->nRanks; + NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); + NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); + comm->sharedRes = sharedRes; + sharedRes->refCount = 1; + } else { + comm->sharedRes = parent->sharedRes; + ncclAtomicRefCountIncrement(&parent->sharedRes->refCount); + } CUDACHECK(hipDeviceGetAttribute(&comm->WarpSize, hipDeviceAttributeWarpSize, comm->cudaDev)); - *comret = comm; + if (comm->topParentRanks == NULL) { + NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks)); + for (int i = 0; i < comm->nRanks; ++i) + comm->topParentRanks[i] = i; + } + + ncclIntruQueueMpscConstruct(&comm->callbackQueue); return ncclSuccess; } @@ -559,8 +588,8 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->deviceStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; @@ -608,7 +637,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; if (comm->channels[c].ring.userRanks != nullptr) { - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, 
comm->sharedRes->deviceStream.cudaStream), ret, fail); } } @@ -633,10 +662,10 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { NCCLCHECK(mscclInit(comm)); } - NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); exit: - CUDACHECK(cudaStreamSynchronize(comm->deviceStream.cudaStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream)); + CUDACHECK(cudaStreamSynchronize(comm->sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; @@ -661,7 +690,6 @@ static void showVersion() { static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { info->rank = comm->rank; - info->virtualId = comm->virtualId; CUDACHECK(cudaGetDevice(&info->cudaDev)); info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; @@ -689,7 +717,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u comm->hasFineGrain = info->hasFineGrain; info->comm = comm; - info->cudaCompCap = ncclCudaCompCap(); + info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap; return ncclSuccess; } @@ -739,6 +767,13 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize(); else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); else comm->p2pChunkSize = ncclParamP2pPciChunkSize(); + if (comm->sharedRes->owner != comm) { + /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. 
*/ + comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize); + } else { + comm->sharedRes->tpP2pChunkSize = comm->p2pChunkSize; + } + INFO(NCCL_INIT, "P2P Chunksize set to %d", comm->p2pChunkSize); return ncclSuccess; } @@ -748,7 +783,7 @@ NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0); NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); -static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collNetGraph) { +static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* collNetGraph) { ncclResult_t ret = ncclSuccess; int* heads = NULL; int rank = comm->rank; @@ -758,6 +793,13 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN int nHeads = collNetGraph->nChannels; int highestTransportType0, highestTransportType1; char line[1024]; + bool share; + + struct collnetShareInfo { + int headPosition; + int isMaster; + }; + struct collnetShareInfo* infos = NULL; NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail); // Head GPU index is always 0 @@ -765,18 +807,124 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN heads[c] = collNetGraph->intra[c * comm->localRanks + 0]; } - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - for (int h = 0; h < nHeads; h++) { - const int head = heads[h]; - collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv); - if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend); + comm->collNetHeads = heads; + comm->collNetHeadsNum = nHeads; + if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) { + NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail); + /* check whether child can share collnet resources of parent. 
Since parent builds each collnet communicator + * based on heads with the same head position in each node, as long as the collnet heads of child comm + * can match parent's heads, we can let child communicator share parent's collnet resources. */ + for (int h = 0; h < nHeads; ++h) { + int prev = INT_MIN; + struct collnetShareInfo* myinfo; + + share = true; + myinfo = infos + comm->rank; + memset(myinfo, 0, sizeof(struct collnetShareInfo)); + /* find the child head position in parent collnet heads. */ + if (heads[h] == comm->rank) { + myinfo->headPosition = -1; + myinfo->isMaster = 1; + for (int th = 0; th < parent->collNetHeadsNum; ++th) + if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) { + myinfo->headPosition = th; + break; + } + } + + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail); + for (int i = 0; i < comm->nRanks; ++i) { + if (infos[i].isMaster) { + if (prev == INT_MIN) + prev = infos[i].headPosition; + + if (infos[i].headPosition == -1 || prev != infos[i].headPosition) { + share = false; + break; + } + } + } + + if (share) { + if (myinfo->isMaster) { + comm->collNetSharedRes = parent->collNetSharedRes; + comm->collNetChannels = std::min(std::max(comm->nChannels, comm->nvlsChannels), parent->collNetSharedRes->nChannels); + for (int c = 0; c < comm->collNetChannels; ++c) + NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); + } + } else { + /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot + * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be + * lifted by sharp plugin/IB hardware in the future. 
*/ + collNetSetupFail = 1; + if (comm->rank == 0) { + WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); + } + goto fail; + } } - // Verify CollNet setup across ranks after trying the first channel - if (c == 0) { - NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); + share = true; + } else { + /* this allocated buffer will be freed on proxy side */ + NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); + /* TODO: min or max? */ + comm->collNetChannels = comm->collNetSharedRes->nChannels = std::max(comm->nChannels, comm->nvlsChannels); + comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; + for (int c = 0; c < comm->collNetChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); + for (int h = 0; h < nHeads; h++) { + const int head = heads[h]; + collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv); + if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend); + } + // Verify CollNet setup across ranks after trying the first channel + if (c == 0) { + NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); + } } + share = false; } + + if (share) { + memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix)); + } else { + do { + /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some + ranks don't connect to sharp we enable a (redop,type) if any rank claims + support. 
*/ + const ncclRedOp_t redops[] = {ncclSum, ncclProd, ncclMin, ncclMax}; + uint8_t(*matrix)[4][ncclNumTypes]; + bool isHead = false; + matrix = nullptr; + NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); + for (int h = 0; h < nHeads; h++) isHead |= (heads[h] == comm->rank); + if (isHead) { + for (int ty=0; ty < ncclNumTypes; ty++) { + for (int i=0; i < 4; i++) { + int support = 0; + NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, redops[i], &support), ret, matrix_end); + // bit 0 = not supported, bit 1 = supported + matrix[rank][redops[i]][ty] = 1<<(support ? 1 : 0); + } + } + } + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end); + for (int ty=0; ty < ncclNumTypes; ty++) { + for (int i=0; i < 4; i++) { + int op = redops[i]; + uint8_t accum = 0; + for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty]; + // We support (redop, type) if some rank supports it and no rank doesn't support it + comm->collNetSupportMatrix[op][ty] = (accum == (1<<1)); + } + } + matrix_end: + free(matrix); + if (ret != ncclSuccess) goto fail; + } while (0); + } + // Verify CollNet setup across ranks after trying all channels NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); @@ -819,6 +967,9 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN // Exchange highest intra-node transport type among ranks // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; + if (share) { + comm->intraHighestTransportType = std::max(comm->intraHighestTransportType, parent->intraHighestTransportType); + } NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); for (int i = 0; i < comm->localRanks; i++) { if (highestTypes[i] > comm->intraHighestTransportType) @@ -828,7 +979,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN INFO(NCCL_INIT, "rank %d Connected CollNet", rank); exit: - free(heads); + free(infos); return ret; fail: ncclTransportCollNetFree(comm); @@ -836,18 +987,19 @@ fail: goto exit; } -static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { +static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) { // We use 2 AllGathers // 1. { peerInfo, comm, compCap} // 2. { nChannels, graphInfo, topoRanks } ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; - uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); cpu_set_t affinitySave; struct ncclTopoGraph ringGraph; struct ncclTopoGraph treeGraph; struct ncclTopoGraph collNetGraph; + struct ncclTopoGraph nvlsGraph; + struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; struct graphInfo { int pattern; @@ -860,11 +1012,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm }; struct allGatherInfo { - int netDev; - int collNetSupport; - struct graphInfo tree; - struct graphInfo ring; - struct graphInfo collNet; + struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; struct ncclTopoRanks topoRanks; int nc; bool pivotA2AEnabled; @@ -880,38 +1028,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int* nvbPeers = NULL; struct ncclProxyConnector proxyConn; int* pxnPeers = NULL; 
- - TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); - NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)commId, comm), ret, fail); + int *topParentLocalRanks = NULL; + int tpProxyRank; // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root - NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, commHash), ret, fail); + NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); - //If virtualId == -1 multiRank support has not been requested by user, using original interface - if (comm->virtualId == -1) { - for (int i = 0; i < nranks; i++) { - if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { - WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); - ret = ncclInvalidUsage; - goto fail; - } - } - } - else { - //Multiple ranks can use the same device, but need to have different virtualId's. 
- for (int i = 0; i < nranks; i++) { - for (int j=0; j < nranks; j++) { - if (j==i) continue; - if((comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash) && - (comm->peerInfo[i].busId == comm->peerInfo[j].busId) && - (comm->peerInfo[i].virtualId == comm->peerInfo[j].virtualId)) { - WARN("Duplicate virtualId detected : rank %d and rank %d both on GPU device %lx virtualId %d", - i, j, comm->peerInfo[rank].busId, comm->peerInfo[i].virtualId); - return ncclInvalidUsage; - } - } + for (int i = 0; i < nranks; i++) { + if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); + ret = ncclInvalidUsage; + goto fail; } } // AllGather1 - end @@ -919,6 +1048,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm do { // Compute intra-process ranks int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; + for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap); + for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap); for (int i = 0; i < nranks; i++) { if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { @@ -983,8 +1114,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); } - // Launch proxy service thread - NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); + // Determine local CollNet support + if (collNetSupport(comm)) { + char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); + if (collNetEnable != NULL) { + INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); + if (strcmp(collNetEnable, "1") == 0) { + comm->collNetSupport = 
1; + } + } + } + + // Determine local Nvls support + NCCLCHECK(ncclNvlsInit(comm)); // Get rings and trees ringGraph.id = 0; @@ -1007,8 +1149,24 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); + if (comm->collNetSupport) { + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); + } else { + collNetGraph.nChannels = 0; + } + + nvlsGraph.id = 3; + nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS; + nvlsGraph.collNet = 0; + nvlsGraph.minChannels = 1; + nvlsGraph.maxChannels = MAXCHANNELS; + if (comm->nvlsSupport) { + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail); + } else { + nvlsGraph.nChannels = 0; + } bool allXgmi, hasPeerAccess; allXgmi = true; @@ -1036,22 +1194,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; if (comm->rank == ncclParamGraphDumpFileRank()) { - struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; - NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 3, graphs), ret, fail); + struct ncclTopoGraph* dumpGraphs[4] = { &ringGraph, &treeGraph, &collNetGraph, &nvlsGraph }; + NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 4, dumpGraphs), ret, fail); } - // Determine local CollNet support before all-gather - if (collNetSupport(comm)) { - char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); - if (collNetEnable != NULL) { - INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); - if (strcmp(collNetEnable, "1") == 0) { 
- comm->collNetSupport = 1; - } - } - } - if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0; - if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) { if (rcclParamP2pNetDisable() == 0) { if (!(comm->topo->type & RCCL_TOPO_FORCE_INTRA)) comm->p2pNet = 1; @@ -1065,55 +1211,38 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int idx; NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx)); allGather3Data[rank].nc = 2; - if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) || - (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) && + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi) allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2); - if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) || - (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) && + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)) allGather3Data[rank].nc = 4; - if (((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) || - (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) && + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; - NCCLCHECKGOTO(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev), ret, fail); - allGather3Data[rank].tree.pattern = treeGraph.pattern; - allGather3Data[rank].tree.nChannels = 
treeGraph.nChannels; - allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; - allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra; - allGather3Data[rank].tree.bwInter = treeGraph.bwInter; - allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; - allGather3Data[rank].tree.typeInter = treeGraph.typeInter; - allGather3Data[rank].ring.pattern = ringGraph.pattern; - allGather3Data[rank].ring.nChannels = ringGraph.nChannels; - allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; - allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra; - allGather3Data[rank].ring.bwInter = ringGraph.bwInter; - allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; - allGather3Data[rank].ring.typeInter = ringGraph.typeInter; - allGather3Data[rank].collNet.pattern = collNetGraph.pattern; - allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels; - allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; - allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra; - allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter; - allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; - allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter; - allGather3Data[rank].collNetSupport = comm->collNetSupport; allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable(); comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable(); allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled; allGather3Data[rank].mscclEnabled = comm->topo->mscclEnabled; - comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count) - ? 
std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels; - NCCLCHECKGOTO(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks), ret, fail); + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern; + allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels; + allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels; + allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra; + allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter; + allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra; + allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter; + } + + comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail); @@ -1129,7 +1258,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm comm->nNodes++; nodesFirstRank[node] = firstRank; // Record tree pattern of each node as they can be different depending on sm arch - nodesTreePatterns[node] = allGather3Data[r].tree.pattern; + nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern; } comm->rankToNode[r] = node; } @@ -1172,32 +1301,22 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int nc; nc = allGather3Data[0].nc; for (int i=0; i<nranks; i++) { - comm->peerInfo[i].netDev = allGather3Data[i].netDev; allTopoRanks[i] = &allGather3Data[i].topoRanks; nc = std::min(allGather3Data[i].nc, nc); // Make sure we align all ranks so that the tuning is consistent across ranks - treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels); - treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); - treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra); - treeGraph.bwInter =
std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter); - treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); - treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter); - ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); - ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); - ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra); - ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter); - ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); - ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter); - collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); - collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); - collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra); - collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter); - collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); - collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); - comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled; comm->topo->ll128Enabled = comm->topo->ll128Enabled && allGather3Data[i].ll128Enabled; comm->topo->mscclEnabled = comm->topo->mscclEnabled && allGather3Data[i].mscclEnabled; + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels); + graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels); + graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra,
graphs[a]->bwIntra); + graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter); + graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra); + graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); + } + if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; + if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = 0; } comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = @@ -1226,8 +1345,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc), ret, fail); + NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, nc), ret, fail); if (comm->topo->pivotA2ANumBiRings == 3) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph)); // AllGather3 - end @@ -1248,6 +1367,29 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail); + // Compute nChannels per peer for p2p + NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); + + /* until now, all info of comm should be known. We can initialize shared resources and + * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before + * all proxy operations. 
*/ + if (comm->sharedRes->owner == comm) { + comm->sharedRes->tpNLocalRanks = comm->localRanks; + comm->sharedRes->magic = comm->magic; + comm->sharedRes->tpNChannels = comm->nChannels; + comm->sharedRes->tpP2pNChannels = comm->p2pnChannels; + memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks); + } + NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail); + for (int i = 0; i < comm->localRanks; ++i) { + int tpRank = comm->topParentRanks[comm->localRankToRank[i]]; + topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank]; + } + comm->topParentLocalRanks = topParentLocalRanks; + + // Launch proxy service thread, after this, the proxy calls can be used. + NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); + // Connect with prev/next for each ring for (int c=0; c<comm->nChannels; c++) { struct ncclChannel* channel = comm->channels+c; @@ -1278,42 +1420,44 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail); INFO(NCCL_INIT, "Connected all trees"); - // Check if we can setup CollNet - if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph); + // Setup NVLS + NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); + // And NVLS trees if needed + if (comm->nvlsSupport && comm->localRanks > 1) { + for (int c=0; c<comm->nvlsChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail); + INFO(NCCL_INIT, "Connected NVLS tree"); + } #if CUDART_VERSION >= 12010 - NCCLCHECKGOTO(ncclNvlsSetup(comm), ret, fail); + // Check if we can setup CollNet + if
(comm->collNetSupport > 0) collNetTrySetup(comm, parent, &collNetGraph); #endif TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations - do { - int myCompCap = comm->peerInfo[rank].cudaCompCap; - int minCompCap = myCompCap, maxCompCap = myCompCap; - for (int i = 0; i < nranks; i++) { - comm->minCompCap = minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap); - maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap); - } - NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail); - } while(0); - - // Compute nChannels per peer for p2p - NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); + NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); do { // Setup p2p structures in comm->tasks struct ncclTasks* tasks = &comm->tasks; - int nRanks = comm->nRanks; int node = comm->node; int nNodes = comm->nNodes; struct ncclNodeRanks *nodeRanks = comm->nodeRanks; int localRank = comm->localRank; - tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - int s=0, r=0; + // We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8. 
+ int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2); + tasks->p2pOrderSteps = comm->nNodes * steps; + tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); + tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); + tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); + int i=0; // schedule delta 0, +1, -1, +2, -2, ... // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. for (int d=0; d <= nNodes/4; d++) { @@ -1323,18 +1467,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm sched_delta: int recvNode = (node+nNodes-delta)%nNodes; int sendNode = (node+delta)%nNodes; - int steps = comm->maxLocalRanks; for (int step=0; step < steps; step++) { int recvIndex = (localRank-step+steps)%steps; - if (recvIndex < nodeRanks[recvNode].localRanks) { - tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex]; - r++; - } + int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1; + tasks->p2pRecvOrder[i] = recvRank; int sendIndex = (localRank+step)%steps; - if (sendIndex < nodeRanks[sendNode].localRanks) { - tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex]; - s++; - } + int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? 
nodeRanks[sendNode].localRankToRank[sendIndex] : -1; + tasks->p2pSendOrder[i] = sendRank; + i++; } index++; if (index == 1 && deltas[1] == deltas[0]) index++; @@ -1346,7 +1486,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm goto sched_delta; } } - assert(s == nRanks && r == nRanks); + assert(i == tasks->p2pOrderSteps); } while (0); if (ncclParamNvbPreconnect()) { @@ -1358,13 +1498,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int channelId; for (int c=0; cp2pnChannelsPerPeer; c++) { NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer].send[1].connected == 0) { + if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { comm->connectSend[peer] |= (1UL<p2pnChannelsPerPeer; c++) { NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { + if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { comm->connectRecv[peer] |= (1UL<rank, &proxyConn), ret, fail); - NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); + tpProxyRank = comm->topParentRanks[comm->rank]; + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); // Then to remote ones when using PXN if (ncclPxnDisable(comm) == 0) { int nranks; NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); for (int r=0; rp2pnChannels, sizeof(int), NULL, 0), ret, fail); + tpProxyRank = comm->topParentRanks[pxnPeers[r]]; + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, 
ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); } } @@ -1416,8 +1558,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm exit: if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - // Unlink proxy shm to make sure it will be properly cleaned up. - ncclProxyShmUnlink(comm); + /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can + * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be + * properly cleaned up. */ + if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); @@ -1445,11 +1589,15 @@ NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT); struct ncclCommInitRankAsyncJob { struct ncclAsyncJob base; - ncclComm_t* newcomm; + struct ncclComm* comm; + struct ncclComm** newcomm; + int cudaDev; + // For ncclCommInitRank int nranks, myrank; ncclUniqueId commId; - int cudaDev; - int virtualId; + // for ncclCommSplit + struct ncclComm* parent; + int color, key; }; struct ncclCommFinalizeAsyncJob { @@ -1457,26 +1605,71 @@ struct ncclCommFinalizeAsyncJob { ncclComm_t comm; }; +NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); + +static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { + int* colors = NULL; + int* keys = NULL; + int nRanks = 0, myRank = 0; + ncclResult_t ret = ncclSuccess; + + NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail); + + // Compute nRanks, my rank and the ranks (of the original comm) before and after me + colors[parent->rank] = color; + keys[parent->rank] = key; + 
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail); + NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail); + + // Negative color does not create a new comm. Return now. + if (color == NCCL_SPLIT_NOCOLOR) goto exit; + + memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks); + for (int i = 0; i < parent->nRanks; i++) { + if (colors[i] != color) continue; + // Find where to insert this rank + int insert = 0; + while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++; + // Shift ranks by one after insert + for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1]; + // Insert our rank + parentRanksRet[insert] = i; + nRanks++; + } + + for (int i = 0; i < nRanks; i++) { + if (parentRanksRet[i] == parent->rank) myRank = i; + } + + *nRanksRet = nRanks; + *myRankRet = myRank; + +exit: + free(colors); + free(keys); + return ret; +fail: + goto exit; +} + static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; - ncclComm_t* newcomm = job->newcomm; - ncclComm_t comm = *newcomm; - int nranks = job->nranks; - ncclUniqueId commId = job->commId; // C++ struct assignment - int myrank = job->myrank; - int cudaDev = job->cudaDev; - int virtualId = job->virtualId; + ncclComm_t comm = job->comm; + ncclResult_t res = ncclSuccess; int archMajor, archMinor; size_t maxLocalSizeBytes = 0; - ncclResult_t res = ncclSuccess; + int cudaDev = job->cudaDev; + int* parentRanks = NULL; + int cudaArch; int64_t stackSize = rcclParamStackSizeOverride() ? 
rcclParamStackSizeOverride() : maxLocalSizeBytes; CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); - CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev)); - CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev)); - comm->cudaArch = 100*archMajor + 10*archMinor; + CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); + CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); + cudaArch = 100*archMajor + 10*archMinor; - NCCLCHECK(ncclInitKernelsForDevice(comm->cudaArch, &maxLocalSizeBytes)); + NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) #ifdef USE_INDIRECT_FUNCTION_CALL @@ -1485,18 +1678,49 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize)); } #endif - NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank, virtualId), res, fail); - NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, fail); + + if (job->parent) { + NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); + NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); + // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. 
+ if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; + snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color); + NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); + NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail); + } else { + NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); + NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail); + } + + comm->cudaArch = cudaArch; + comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); + + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); + + NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail); // update communicator state comm->initState = ncclSuccess; // Trace this call for replay tool - TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", - *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev); + if (job->parent) { + /* unlink child abort flag. 
*/ + __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE); + TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", + job->parent, job->color, job->key, comm, comm->rank, comm->nRanks); + } else { + TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", + comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev); + } - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx localSize %zi used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, maxLocalSizeBytes, allocTracker[(*newcomm)->cudaDev].totalAllocSize); + + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx commId 0x%llx localSize %zi used %ld bytes - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, (unsigned long long)hashUniqueId(job->commId), maxLocalSizeBytes, allocTracker[comm->cudaDev].totalAllocSize); exit: + if (job->newcomm) { + /* assign it to user pointer. */ + __atomic_store_n(job->newcomm, comm, __ATOMIC_RELEASE); + } + free(parentRanks); return res; fail: comm->initState = res; @@ -1510,14 +1734,88 @@ fail: INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \ } -static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { +static ncclResult_t envConfigOverride(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; - /* config must not be NULL in this function */ + const char* tmpNetName = comm->config.netName; + const char* envNetName; int blockingEnv; int cgaClusterSizeEnv; int minCTAsEnv; int maxCTAsEnv; - const char *envNetName, *tmpNetName; + int splitShareEnv; + + /* override configuration from env variable. 
*/ + blockingEnv = ncclParamCommBlocking(); + if (blockingEnv == 0 || blockingEnv == 1) + comm->config.blocking = blockingEnv; + + cgaClusterSizeEnv = ncclParamCGAClusterSize(); + if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { + comm->config.cgaClusterSize = cgaClusterSizeEnv; + } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { + WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); + comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; + } + + minCTAsEnv = ncclParamMinCTAs(); + if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.minCTAs = minCTAsEnv; + } + + maxCTAsEnv = ncclParamMaxCTAs(); + if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.maxCTAs = maxCTAsEnv; + } + + envNetName = getenv("NCCL_NET"); + if (envNetName) + tmpNetName = envNetName; + if (tmpNetName != NULL) { + int netNameLen = strlen(tmpNetName) + 1; + comm->config.netName = (char*)malloc(netNameLen); + memcpy((void*)comm->config.netName, tmpNetName, netNameLen); + } else { + comm->config.netName = NULL; + } + + splitShareEnv = ncclParamCommSplitShareResources(); + if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.splitShare = splitShareEnv; + } + + /* cap channels if needed */ + if (comm->config.minCTAs > MAXCHANNELS) { + WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); + comm->config.minCTAs = MAXCHANNELS; + } + + if (comm->config.maxCTAs > MAXCHANNELS) { + WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); + comm->config.maxCTAs = MAXCHANNELS; + } + + if (comm->config.minCTAs > comm->config.maxCTAs) { + WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); + comm->config.minCTAs = comm->config.maxCTAs; + } + + if (comm->config.splitShare != 1 && 
comm->config.splitShare != 0) { + WARN("splitShare %d is not a valid value 0/1, set it to 0\n", comm->config.splitShare); + comm->config.splitShare = 0; + } + + return ret; +} + +static ncclResult_t copyCommConfig(ncclComm_t childComm, ncclComm_t parnet) { + memcpy(&childComm->config, &parnet->config, sizeof(ncclConfig_t)); + NCCLCHECK(envConfigOverride(childComm)); + return ncclSuccess; +} + +static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { + ncclResult_t ret = ncclSuccess; + /* config must not be NULL in this function */ ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr; @@ -1570,71 +1868,29 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { goto fail; } + if (internalConfigPtr->splitShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->splitShare != 0 && internalConfigPtr->splitShare != 1) { + WARN("Invalid config splitShare attribute value %d", internalConfigPtr->splitShare); + ret = ncclInvalidArgument; + goto fail; + } + /* default config value can be tuned on different platform. 
*/ NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); - - tmpNetName = internalConfigPtr->netName; + NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); /* assign config to communicator */ - comm->blocking = internalConfigPtr->blocking; - comm->cgaClusterSize = internalConfigPtr->cgaClusterSize; - comm->minCTAs = internalConfigPtr->minCTAs; - comm->maxCTAs = internalConfigPtr->maxCTAs; + comm->config.blocking = internalConfigPtr->blocking; + comm->config.cgaClusterSize = internalConfigPtr->cgaClusterSize; + comm->config.minCTAs = internalConfigPtr->minCTAs; + comm->config.maxCTAs = internalConfigPtr->maxCTAs; + comm->config.netName = internalConfigPtr->netName; + comm->config.splitShare = internalConfigPtr->splitShare; - /* override configuration from env variable. */ - blockingEnv = ncclParamCommBlocking(); - if (blockingEnv == 0 || blockingEnv == 1) - comm->blocking = blockingEnv; - - cgaClusterSizeEnv = ncclParamCGAClusterSize(); - if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { - comm->cgaClusterSize = cgaClusterSizeEnv; - } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { - WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. 
Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); - comm->cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; - } - - minCTAsEnv = ncclParamMinCTAs(); - if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->minCTAs = minCTAsEnv; - } - - maxCTAsEnv = ncclParamMaxCTAs(); - if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->maxCTAs = maxCTAsEnv; - } - - /* cap channels if needed */ - if (comm->minCTAs > MAXCHANNELS) { - WARN("minCTAs %d is larger than #channels upper limit %d", comm->minCTAs, MAXCHANNELS); - comm->minCTAs = MAXCHANNELS; - } - - if (comm->maxCTAs > MAXCHANNELS) { - WARN("maxCTAs %d is larger than #channels upper limit %d", comm->maxCTAs, MAXCHANNELS); - comm->maxCTAs = MAXCHANNELS; - } - - if (comm->minCTAs > comm->maxCTAs) { - WARN("minCTAs %d is larger than maxCTAs %d", comm->minCTAs, comm->maxCTAs); - ret = ncclInvalidArgument; - goto fail; - } - - envNetName = getenv("NCCL_NET"); - if (envNetName) - tmpNetName = envNetName; - if (tmpNetName != NULL) { - int netNameLen = strlen(tmpNetName) + 1; - comm->netName = (char*)malloc(netNameLen); - memcpy(comm->netName, tmpNetName, netNameLen); - } else { - comm->netName = NULL; - } + NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); exit: return ret; @@ -1642,13 +1898,7 @@ fail: goto exit; } -static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) { - struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; - ncclCommDestroy(*job->newcomm); - *job->newcomm = nullptr; -} - -static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config, int virtualId) { +static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) { ncclResult_t res = ncclSuccess; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob *job = NULL; @@ -1675,18 +1925,19 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, 
ncclUni NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail); NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc((uint32_t**)&comm->abortFlagRefCount, 1), res, fail); + *comm->abortFlagRefCount = 1; NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ comm->initState = ncclInternalError; *newcomm = comm; NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); - job->newcomm = newcomm; + job->comm = comm; job->nranks = nranks; job->commId = commId; // C++ struct assignment job->myrank = myrank; job->cudaDev = cudaDev; - job->virtualId = virtualId; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: @@ -1694,6 +1945,7 @@ exit: fail: if (comm) { if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag); + if (comm->abortFlagRefCount) free(comm->abortFlagRefCount); free(comm); } if (newcomm) *newcomm = NULL; @@ -1724,27 +1976,10 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config, -1)); + NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config)); return ncclSuccess; } -NCCL_API(ncclResult_t, ncclCommInitRankMulti, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int virtualId); -ncclResult_t ncclCommInitRankMulti(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int virtualId) { - // Load the CUDA driver and dlsym hooks (can fail on old drivers) - if (ncclParamDmaBufEnable()) rocmLibraryInit(); - - int cudaDev; - ncclConfig_t config = NCCL_CONFIG_INITIALIZER; - CUDACHECK(hipGetDevice(&cudaDev)); - - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRank, 
CommInitRankSchema, payload) - - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config, virtualId)); - return ncclSuccess; -} - - NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclResult_t ret = ncclSuccess; @@ -1794,7 +2029,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { NCCLCHECKGOTO(ncclGroupStart(), ret, fail); for (int i=0; iblocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); return ret; fail: - if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommSetAsyncError(*newcomm, ret); + if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); goto exit; } @@ -1856,8 +2091,8 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); if (comm->initState == ncclSuccess) { - NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->deviceStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail); } NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. 
@@ -1953,10 +2188,10 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (comm && !comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) }; + if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) }; return ret; fail: - if (comm && !comm->blocking) (void) ncclCommSetAsyncError(comm, ret); + if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } @@ -2001,10 +2236,10 @@ static ncclResult_t commReclaim(ncclComm_t comm) { } } - /* ncclProxyDestroy() loop must be put after commDestroySync() loop. Namely, you cannot do: + /* ncclProxyStop() loop must be put after commDestroySync() loop. Namely, you cannot do: * while(...) { * commDestroySync(...); - * ncclProxyDestroy(...); + * ncclProxyStop(...); * } * Considering one process multi-gpu case, we must guarantee all kernels are complete before * we free proxy resources; otherwise, we will face invalid memory issues where proxy connection @@ -2019,7 +2254,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) { nextIntraComm = nextIntraComm->intraNext; /* free intraprocess proxy resources. 
*/ - if ((ret = ncclProxyDestroy(curIntraComm)) != ncclSuccess) { + if ((ret = ncclProxyStop(curIntraComm)) != ncclSuccess) { WARN("commReclaim: comm %p (rank = %d) destroys proxy resource error %d", curIntraComm, curRank, ret); } } @@ -2080,6 +2315,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { return ncclSuccess; } + volatile uint32_t* childAbortFlag; int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; @@ -2089,6 +2325,10 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); // Ask anything that might still be running on the device to quit + childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE); + if (childAbortFlag != NULL) { + *childAbortFlag = 1; + } *comm->abortFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. */ @@ -2100,15 +2340,78 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { return ncclSuccess; } +NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { + struct ncclCommInitRankAsyncJob *job = NULL; + struct ncclComm* childComm = NCCL_COMM_NULL; + ncclResult_t res = ncclSuccess; + + NCCLCHECK(ncclGroupStartInternal()); + NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail); + NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail); + + /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. 
*/ + *newcomm = NCCL_COMM_NULL; + if (color == NCCL_SPLIT_NOCOLOR) { + INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank); + } else { + NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail); + if (comm->config.splitShare) { + childComm->abortFlag = comm->abortFlag; + childComm->abortFlagRefCount = comm->abortFlagRefCount; + comm->childAbortFlag = NULL; + ncclAtomicRefCountIncrement(comm->abortFlagRefCount); + } else { + NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&childComm->abortFlag, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc((uint32_t**)&childComm->abortFlagRefCount, 1), res, fail); + /* temporarily used to abort everything during child comm init. */ + comm->childAbortFlag = childComm->abortFlag; + *childComm->abortFlagRefCount = 1; + } + if (config == NULL) { + NCCLCHECKGOTO(copyCommConfig(childComm, comm), res, fail); + } else { + NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail); + } + + /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. 
*/ + childComm->initState = ncclInternalError; + } + + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); + job->comm = childComm; + job->newcomm = newcomm; + job->parent = comm; + job->color = color; + job->key = key; + job->cudaDev = comm->cudaDev; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); + +exit: + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); + return res; +fail: + if (childComm) { + if (comm && !comm->config.splitShare) { + if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag); + if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount); + } + free(childComm); + } + if (newcomm) *newcomm = NULL; + goto exit; +} + NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); const char* ncclGetErrorString(ncclResult_t code) { switch (code) { case ncclSuccess : return "no error"; - case ncclUnhandledCudaError : return "unhandled cuda error"; - case ncclSystemError : return "unhandled system error"; - case ncclInternalError : return "internal error"; - case ncclInvalidArgument : return "invalid argument"; - case ncclInvalidUsage : return "invalid usage"; + case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; + case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for details)"; + case ncclInternalError : return "internal error - please report this issue to the NCCL developers"; + case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)"; + case ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)"; case ncclRemoteError : return "remote process exited or there was a network error"; case ncclInProgress : return "NCCL operation in progress"; default : return "unknown result code"; diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 4fe90237ce..334ee10f69 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -6,10 +6,46 
@@ #include "nccl.h" #include "debug.h" +#include "param.h" #include "cudawrap.h" #include +// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage +NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0); + +static int ncclCuMemSupported = 0; + +// Determine whether CUMEM & VMM RDMA is supported on this platform +int ncclIsCuMemSupported() { +#if CUDART_VERSION < 11030 + return 0; +#else + CUdevice currentDev; + int cudaDev; + int cudaDriverVersion; + int flag = 0; + ncclResult_t ret = ncclSuccess; + CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error); + if (cudaDriverVersion < 12000) return 0; // Need CUDA_VISIBLE_DEVICES support + CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error); + if (CUPFN(cuMemCreate) == NULL) return 0; + CUCHECKGOTO(cuDeviceGet(&currentDev, cudaDev), ret, error); + // Query device to see if CUMEM VMM support is available + CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error); + if (!flag) return 0; + // Query device to see if CUMEM RDMA support is available + CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error); + if (!flag) return 0; +error: + return (ret == ncclSuccess); +#endif +} + +int ncclCuMemEnable() { + return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable()); +} + #define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 @@ -35,6 +71,7 @@ DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020); DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020); DECLARE_CUDA_PFN(cuMemMap, 10020); DECLARE_CUDA_PFN(cuMemRelease, 10020); +DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000); DECLARE_CUDA_PFN(cuMemSetAccess, 10020); DECLARE_CUDA_PFN(cuMemUnmap, 10020); #if CUDA_VERSION >= 11070 @@ -89,7 +126,6 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuCtxSetCurrent, 4000, 1); LOAD_SYM(cuCtxGetDevice,
2000, 1); /* cuMem API support */ -#if CUDA_VERSION >= 11030 LOAD_SYM(cuMemAddressReserve, 10020, 1); LOAD_SYM(cuMemAddressFree, 10020, 1); LOAD_SYM(cuMemCreate, 10020, 1); @@ -98,9 +134,9 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1); LOAD_SYM(cuMemMap, 10020, 1); LOAD_SYM(cuMemRelease, 10020, 1); + LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1); LOAD_SYM(cuMemSetAccess, 10020, 1); LOAD_SYM(cuMemUnmap, 10020, 1); -#endif #if CUDA_VERSION >= 11070 LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support #endif @@ -135,7 +171,7 @@ static void initOnceFunc() { if (ncclCudaPath == NULL) snprintf(path, 1024, "%s", "libcuda.so"); else - snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so"); + snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so"); (void) dlerror(); // Clear any previous errors cudaLib = dlopen(path, RTLD_LAZY); @@ -195,6 +231,9 @@ static void initOnceFunc() { } #endif + // Determine whether we support the cuMem APIs or not + ncclCuMemSupported = ncclIsCuMemSupported(); + initResult = ncclSuccess; return; error: diff --git a/src/misc/ibvsymbols.cc b/src/misc/ibvsymbols.cc new file mode 100644 index 0000000000..c41a457c8f --- /dev/null +++ b/src/misc/ibvsymbols.cc @@ -0,0 +1,158 @@ +#include +#include + +#include "ibvsymbols.h" + +#ifdef NCCL_BUILD_RDMA_CORE +/* RDMA-core linking mode. 
Symbols are pointers to linked IB Verbs */ + +#define ASSIGN_SYM(container, symbol, name) container->name= &symbol; + +// Passthrough function for ibv_reg_mr macro in verbs.h +struct ibv_mr* ibv_internal_reg_mr( + struct ibv_pd* pd, + void* addr, + size_t length, + int access) { + return ibv_reg_mr(pd, addr, length, access); + } + +// Passthrough function for ibv_query_port macro in verbs.h +int ibv_internal_query_port( + struct ibv_context* context, + uint8_t port_num, + struct ibv_port_attr* port_attr) { + return ibv_query_port(context, port_num, port_attr); +} + +ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { + ASSIGN_SYM(ibvSymbols, ibv_get_device_list, ibv_internal_get_device_list); + ASSIGN_SYM(ibvSymbols, ibv_free_device_list, ibv_internal_free_device_list); + ASSIGN_SYM(ibvSymbols, ibv_get_device_name, ibv_internal_get_device_name); + ASSIGN_SYM(ibvSymbols, ibv_open_device, ibv_internal_open_device); + ASSIGN_SYM(ibvSymbols, ibv_close_device, ibv_internal_close_device); + ASSIGN_SYM(ibvSymbols, ibv_get_async_event, ibv_internal_get_async_event); + ASSIGN_SYM(ibvSymbols, ibv_ack_async_event, ibv_internal_ack_async_event); + ASSIGN_SYM(ibvSymbols, ibv_query_device, ibv_internal_query_device); + ASSIGN_SYM(ibvSymbols, ibv_query_gid, ibv_internal_query_gid); + ASSIGN_SYM(ibvSymbols, ibv_query_qp, ibv_internal_query_qp); + ASSIGN_SYM(ibvSymbols, ibv_alloc_pd, ibv_internal_alloc_pd); + ASSIGN_SYM(ibvSymbols, ibv_dealloc_pd, ibv_internal_dealloc_pd); + + ASSIGN_SYM(ibvSymbols, ibv_reg_mr_iova2, ibv_internal_reg_mr_iova2); + ASSIGN_SYM(ibvSymbols, ibv_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr); + + ASSIGN_SYM(ibvSymbols, ibv_dereg_mr, ibv_internal_dereg_mr); + ASSIGN_SYM(ibvSymbols, ibv_create_cq, ibv_internal_create_cq); + ASSIGN_SYM(ibvSymbols, ibv_destroy_cq, ibv_internal_destroy_cq); + ASSIGN_SYM(ibvSymbols, ibv_create_qp, ibv_internal_create_qp); + ASSIGN_SYM(ibvSymbols, ibv_modify_qp, ibv_internal_modify_qp); +
ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp); + ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init); + ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str); + + ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr; + ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port; + + return ncclSuccess; +} + +#else +/* RDMA-core dynamic loading mode. Symbols are loaded from shared objects. */ + +#include +#include "core.h" + +// IBVERBS Library versioning +#define IBVERBS_VERSION "IBVERBS_1.1" + +ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { + static void* ibvhandle = NULL; + void* tmp; + void** cast; + + ibvhandle=dlopen("libibverbs.so", RTLD_NOW); + if (!ibvhandle) { + ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW); + if (!ibvhandle) { + INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]"); + goto teardown; + } + } + +#define LOAD_SYM(handle, symbol, funcptr) do { \ + cast = (void**)&funcptr; \ + tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \ + if (tmp == NULL) { \ + WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \ + goto teardown; \ + } \ + *cast = tmp; \ + } while (0) + +// Attempt to load a specific symbol version - fail silently +#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ + cast = (void**)&funcptr; \ + *cast = dlvsym(handle, symbol, version); \ + } while (0) + + LOAD_SYM(ibvhandle, "ibv_get_device_list", ibvSymbols->ibv_internal_get_device_list); + LOAD_SYM(ibvhandle, "ibv_free_device_list", ibvSymbols->ibv_internal_free_device_list); + LOAD_SYM(ibvhandle, "ibv_get_device_name", ibvSymbols->ibv_internal_get_device_name); + LOAD_SYM(ibvhandle, "ibv_open_device", ibvSymbols->ibv_internal_open_device); + LOAD_SYM(ibvhandle, "ibv_close_device", ibvSymbols->ibv_internal_close_device); + LOAD_SYM(ibvhandle, "ibv_get_async_event", ibvSymbols->ibv_internal_get_async_event); + LOAD_SYM(ibvhandle, "ibv_ack_async_event", 
ibvSymbols->ibv_internal_ack_async_event); + LOAD_SYM(ibvhandle, "ibv_query_device", ibvSymbols->ibv_internal_query_device); + LOAD_SYM(ibvhandle, "ibv_query_port", ibvSymbols->ibv_internal_query_port); + LOAD_SYM(ibvhandle, "ibv_query_gid", ibvSymbols->ibv_internal_query_gid); + LOAD_SYM(ibvhandle, "ibv_query_qp", ibvSymbols->ibv_internal_query_qp); + LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibvSymbols->ibv_internal_alloc_pd); + LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibvSymbols->ibv_internal_dealloc_pd); + LOAD_SYM(ibvhandle, "ibv_reg_mr", ibvSymbols->ibv_internal_reg_mr); + // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 + LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibvSymbols->ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); + // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12 + LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibvSymbols->ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12"); + LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibvSymbols->ibv_internal_dereg_mr); + LOAD_SYM(ibvhandle, "ibv_create_cq", ibvSymbols->ibv_internal_create_cq); + LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibvSymbols->ibv_internal_destroy_cq); + LOAD_SYM(ibvhandle, "ibv_create_qp", ibvSymbols->ibv_internal_create_qp); + LOAD_SYM(ibvhandle, "ibv_modify_qp", ibvSymbols->ibv_internal_modify_qp); + LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibvSymbols->ibv_internal_destroy_qp); + LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init); + LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str); + + return ncclSuccess; + +teardown: + ibvSymbols->ibv_internal_get_device_list = NULL; + ibvSymbols->ibv_internal_free_device_list = NULL; + ibvSymbols->ibv_internal_get_device_name = NULL; + ibvSymbols->ibv_internal_open_device = NULL; + ibvSymbols->ibv_internal_close_device = NULL; + ibvSymbols->ibv_internal_get_async_event = NULL; + ibvSymbols->ibv_internal_ack_async_event = NULL; + ibvSymbols->ibv_internal_query_device = NULL; + 
ibvSymbols->ibv_internal_query_port = NULL; + ibvSymbols->ibv_internal_query_gid = NULL; + ibvSymbols->ibv_internal_query_qp = NULL; + ibvSymbols->ibv_internal_alloc_pd = NULL; + ibvSymbols->ibv_internal_dealloc_pd = NULL; + ibvSymbols->ibv_internal_reg_mr = NULL; + ibvSymbols->ibv_internal_reg_mr_iova2 = NULL; + ibvSymbols->ibv_internal_reg_dmabuf_mr = NULL; + ibvSymbols->ibv_internal_dereg_mr = NULL; + ibvSymbols->ibv_internal_create_cq = NULL; + ibvSymbols->ibv_internal_destroy_cq = NULL; + ibvSymbols->ibv_internal_create_qp = NULL; + ibvSymbols->ibv_internal_modify_qp = NULL; + ibvSymbols->ibv_internal_destroy_qp = NULL; + ibvSymbols->ibv_internal_fork_init = NULL; + ibvSymbols->ibv_internal_event_type_str = NULL; + + if (ibvhandle != NULL) dlclose(ibvhandle); + return ncclSystemError; +} + +#endif diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 8a736d3cf1..bc896e10eb 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -8,314 +8,186 @@ #include #include -#include -#include "core.h" - -/*Function Pointers*/ -int (*ibv_internal_fork_init)(void); -struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); -void (*ibv_internal_free_device_list)(struct ibv_device **list); -const char * (*ibv_internal_get_device_name)(struct ibv_device *device); -struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); -int (*ibv_internal_close_device)(struct ibv_context *context); -int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); -void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); -int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); -int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); -int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); -int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr 
*attr, int attr_mask, struct ibv_qp_init_attr *init_attr); -struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); -int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); -struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); -struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); -/* DMA-BUF support */ -struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); -int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); -struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); -int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); -struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); -int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); -int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); -const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); - -// IBVERBS Library versioning -#define IBVERBS_VERSION "IBVERBS_1.1" +#include "ibvsymbols.h" static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t initResult; - -static void initOnceFunc(void) { - static void* ibvhandle = NULL; - void* tmp; - void** cast; - - ibvhandle=dlopen("libibverbs.so", RTLD_NOW); - if (!ibvhandle) { - ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW); - if (!ibvhandle) { - INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]"); - goto teardown; - } - } - -#define LOAD_SYM(handle, symbol, funcptr) do { \ - cast = (void**)&funcptr; \ - tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \ - if (tmp == NULL) { \ - WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \ - goto teardown; \ - } \ - *cast = tmp; \ - } while (0) - -// Attempt to load a specific symbol version - fail silently 
-#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ - cast = (void**)&funcptr; \ - *cast = dlvsym(handle, symbol, version); \ - } while (0) - - LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list); - LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list); - LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name); - LOAD_SYM(ibvhandle, "ibv_open_device", ibv_internal_open_device); - LOAD_SYM(ibvhandle, "ibv_close_device", ibv_internal_close_device); - LOAD_SYM(ibvhandle, "ibv_get_async_event", ibv_internal_get_async_event); - LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibv_internal_ack_async_event); - LOAD_SYM(ibvhandle, "ibv_query_device", ibv_internal_query_device); - LOAD_SYM(ibvhandle, "ibv_query_port", ibv_internal_query_port); - LOAD_SYM(ibvhandle, "ibv_query_gid", ibv_internal_query_gid); - LOAD_SYM(ibvhandle, "ibv_query_qp", ibv_internal_query_qp); - LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd); - LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd); - LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr); - // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 - LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); - // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12 - LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12"); - LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr); - LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq); - LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq); - LOAD_SYM(ibvhandle, "ibv_create_qp", ibv_internal_create_qp); - LOAD_SYM(ibvhandle, "ibv_modify_qp", ibv_internal_modify_qp); - LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibv_internal_destroy_qp); - LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init); - LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str); - - initResult = ncclSuccess; - return; - 
-teardown: - ibv_internal_get_device_list = NULL; - ibv_internal_free_device_list = NULL; - ibv_internal_get_device_name = NULL; - ibv_internal_open_device = NULL; - ibv_internal_close_device = NULL; - ibv_internal_get_async_event = NULL; - ibv_internal_ack_async_event = NULL; - ibv_internal_query_device = NULL; - ibv_internal_query_port = NULL; - ibv_internal_query_gid = NULL; - ibv_internal_query_qp = NULL; - ibv_internal_alloc_pd = NULL; - ibv_internal_dealloc_pd = NULL; - ibv_internal_reg_mr = NULL; - ibv_internal_reg_mr_iova2 = NULL; - ibv_internal_reg_dmabuf_mr = NULL; - ibv_internal_dereg_mr = NULL; - ibv_internal_create_cq = NULL; - ibv_internal_destroy_cq = NULL; - ibv_internal_create_qp = NULL; - ibv_internal_modify_qp = NULL; - ibv_internal_destroy_qp = NULL; - ibv_internal_fork_init = NULL; - ibv_internal_event_type_str = NULL; - - if (ibvhandle != NULL) dlclose(ibvhandle); - initResult = ncclSystemError; - return; -} +struct ncclIbvSymbols ibvSymbols; ncclResult_t wrap_ibv_symbols(void) { - pthread_once(&initOnceControl, initOnceFunc); + pthread_once(&initOnceControl, + [](){ initResult = buildIbvSymbols(&ibvSymbols); }); return initResult; } -#define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \ - if (name_internal == NULL) { \ +/* CHECK_NOT_NULL: helper macro to check for NULL symbol */ +#define CHECK_NOT_NULL(container, internal_name) \ + if (container.internal_name == NULL) { \ WARN("lib wrapper not initialized."); \ return ncclInternalError; \ - } \ - retval = call; \ + } + +#define IBV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + retval = container.call; \ if (retval == error_retval) { \ WARN("Call to " name " failed with error %s", strerror(errno)); \ return ncclSystemError; \ } \ return ncclSuccess; -#define IBV_PTR_CHECK(name_internal, call, retval, error_retval, name) \ - if (name_internal == NULL) { \ - WARN("lib wrapper not 
initialized."); \ - return ncclInternalError; \ - } \ - retval = call; \ +#define IBV_PTR_CHECK(container, internal_name, call, retval, error_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + retval = container.call; \ if (retval == error_retval) { \ WARN("Call to " name " failed"); \ return ncclSystemError; \ } \ return ncclSuccess; -#define IBV_INT_CHECK_RET_ERRNO(name_internal, call, success_retval, name) \ - if (name_internal == NULL) { \ - WARN("lib wrapper not initialized."); \ - return ncclInternalError; \ - } \ - int ret = call; \ +#define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + int ret = container.call; \ if (ret != success_retval) { \ WARN("Call to " name " failed with error %s", strerror(ret)); \ return ncclSystemError; \ } \ return ncclSuccess; -#define IBV_INT_CHECK(name_internal, call, error_retval, name) \ - if (name_internal == NULL) { \ - WARN("lib wrapper not initialized."); \ - return ncclInternalError; \ - } \ - int ret = call; \ +#define IBV_INT_CHECK(container, internal_name, call, error_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + int ret = container.call; \ if (ret == error_retval) { \ WARN("Call to " name " failed"); \ return ncclSystemError; \ } \ return ncclSuccess; -#define IBV_PASSTHRU(name_internal, call) \ - if (name_internal == NULL) { \ - WARN("lib wrapper not initialized."); \ - return ncclInternalError; \ - } \ - call; \ +#define IBV_PASSTHRU(container, internal_name, call) \ + CHECK_NOT_NULL(container, internal_name); \ + container.call; \ return ncclSuccess; ncclResult_t wrap_ibv_fork_init() { - IBV_INT_CHECK(ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init"); + IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init"); } ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) { - *ret = 
ibv_internal_get_device_list(num_devices); + *ret = ibvSymbols.ibv_internal_get_device_list(num_devices); if (*ret == NULL) *num_devices = 0; return ncclSuccess; } ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) { - IBV_PASSTHRU(ibv_internal_free_device_list, ibv_internal_free_device_list(list)); + IBV_PASSTHRU(ibvSymbols, ibv_internal_free_device_list, ibv_internal_free_device_list(list)); } const char *wrap_ibv_get_device_name(struct ibv_device *device) { - if (ibv_internal_get_device_name == NULL) { + if (ibvSymbols.ibv_internal_get_device_name == NULL) { WARN("lib wrapper not initialized."); exit(-1); } - return ibv_internal_get_device_name(device); + return ibvSymbols.ibv_internal_get_device_name(device); } ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/ - IBV_PTR_CHECK(ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device"); + IBV_PTR_CHECK(ibvSymbols, ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device"); } ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/ - IBV_INT_CHECK(ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device"); + IBV_INT_CHECK(ibvSymbols, ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device"); } ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/ - IBV_INT_CHECK(ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event"); + IBV_INT_CHECK(ibvSymbols, ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event"); } ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) { - IBV_PASSTHRU(ibv_internal_ack_async_event, ibv_internal_ack_async_event(event)); + 
IBV_PASSTHRU(ibvSymbols, ibv_internal_ack_async_event, ibv_internal_ack_async_event(event)); } ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device"); } ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); } ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { - IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid"); } ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { - IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp"); } ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) { - IBV_PTR_CHECK_ERRNO(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); 
+ IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); } ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd"); } ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) { - IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); + IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); } struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { - if (ibv_internal_reg_mr == NULL) { + if (ibvSymbols.ibv_internal_reg_mr == NULL) { WARN("lib wrapper not initialized."); return NULL; } - return ibv_internal_reg_mr(pd, addr, length, access); + return ibvSymbols.ibv_internal_reg_mr(pd, addr, length, access); } ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { - if (ibv_internal_reg_mr_iova2 == NULL) { + if (ibvSymbols.ibv_internal_reg_mr_iova2 == NULL) { return ncclInternalError; } if (ret == NULL) { return ncclSuccess; } // Assume dummy call - IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); + IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); } /* DMA-BUF support */ ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { - 
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr"); + IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr"); } struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { - if (ibv_internal_reg_dmabuf_mr == NULL) { + if (ibvSymbols.ibv_internal_reg_dmabuf_mr == NULL) { errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set return NULL; } - return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); + return ibvSymbols.ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); } ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr"); } ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) { - IBV_PTR_CHECK_ERRNO(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); + IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); } ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) { - IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq"); } ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) { - 
IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp"); } ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { - IBV_PTR_CHECK_ERRNO(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); + IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); } ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp"); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp"); } ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) { - *ret = (char *) ibv_internal_event_type_str(event); + *ret = (char *) ibvSymbols.ibv_internal_event_type_str(event); return ncclSuccess; } diff --git a/src/misc/msccl/msccl_setup.cc b/src/misc/msccl/msccl_setup.cc index b815d96fde..c8ddbe477c 100644 --- a/src/misc/msccl/msccl_setup.cc +++ b/src/misc/msccl/msccl_setup.cc @@ -106,7 +106,7 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) { proxyOp.pattern = 0; proxyOp.root = 0; proxyOp.nbytes = status.stepSize*proxyOp.sliceSteps; - proxyOp.opCount = comm->collOpCount; + proxyOp.opCount = comm->sharedRes->collOpCount; int nLoops = (int)(DIVUP(status.nBytes, (size_t)((size_t)hostAlgo->nChunksPerLoop*(size_t)status.chunkEffectiveSize))); int nLoopsChunkSteps = nLoops * status.chunkSteps; for (int ch = 0; ch < hostAlgo->nChannels; ch++) { @@ -123,7 +123,7 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, 
ncclComm_t comm) { } proxyOp.nsteps = nLoopsChunkSteps * nRecvs; if (proxyOp.nsteps > 0) { - NCCLCHECK(mscclSaveProxy(ncclChannel, proxyRecv, recvPeer->peer, &proxyOp, 0)); + NCCLCHECK(mscclSaveProxy(comm, ncclChannel, proxyRecv, recvPeer->peer, &proxyOp, 0)); } } for (int i=0; inSendPeers; i++){ @@ -136,12 +136,12 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) { } proxyOp.nsteps = nLoopsChunkSteps * nSends; if (proxyOp.nsteps > 0) { - NCCLCHECK(mscclSaveProxy(ncclChannel, proxySend, sendPeer->peer, &proxyOp, 0)); + NCCLCHECK(mscclSaveProxy(comm, ncclChannel, proxySend, sendPeer->peer, &proxyOp, 0)); } } } NCCLCHECK(ncclProxyStart(comm)); - comm->collOpCount++; + comm->sharedRes->collOpCount++; return ncclSuccess; } diff --git a/src/misc/rocmwrap.cc b/src/misc/rocmwrap.cc index e32038955d..71ba5b3ab8 100644 --- a/src/misc/rocmwrap.cc +++ b/src/misc/rocmwrap.cc @@ -170,4 +170,6 @@ error: return ncclSystemError; } - +int ncclCuMemEnable() { + return 0; +} diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index 69f7b1bde0..ce05c3ef3e 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -14,6 +14,7 @@ #include #include #include +#include struct shmHandleInternal { int fd; @@ -31,7 +32,7 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS handle->devShmPtr = dptr; handle->shmSize = shmSize; handle->realShmSize = realShmSize; - handle->refcount = (int*)(hptr + shmSize); + handle->refcount = (hptr != NULL) ? 
(int*)(hptr + shmSize) : NULL; if (create) { int slen = strlen(shmPath); handle->shmPath = (char*)malloc(slen + 1); @@ -80,23 +81,20 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de if (hptr == MAP_FAILED) { WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno)); ret = ncclSystemError; + hptr = NULL; goto fail; } if (create) { *(int*)(hptr + shmSize) = refcount; } else { - int remref = __atomic_sub_fetch((int*)(hptr + shmSize), 1, __ATOMIC_RELAXED); + int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize)); if (remref == 0) { /* the last peer has completed attachment, it should unlink the shm mem file. */ if (unlink(shmPath) != 0) { WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno)); } } - - if (refcount != -1) { - WARN("attaching memory should only reduce refcount by 1 but %d is passed", refcount); - } } if (devShmPtr) { @@ -128,13 +126,13 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) { if (tmphandle) { if (tmphandle->fd >= 0) { close(tmphandle->fd); - if (tmphandle->shmPath != NULL && *tmphandle->refcount > 0) { + if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno)); ret = ncclSystemError; } - free(tmphandle->shmPath); } + free(tmphandle->shmPath); } if (tmphandle->shmPtr) { diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 6d934c4bd6..612498c884 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -419,7 +419,7 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; } else if (errno != EAGAIN && errno != EWOULDBLOCK) { - WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno); + WARN("socketTryAccept: Accept failed: %s", strerror(errno)); return ncclSystemError; } return ncclSuccess; @@ 
-429,6 +429,9 @@ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { uint64_t magic; enum ncclSocketType type; int received = 0; + const int one = 1; + SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); if (received == 0) return ncclSuccess; NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); diff --git a/src/nccl.h.in b/src/nccl.h.in index 6047b2f21d..e4ebb92a21 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -23,7 +23,6 @@ #define RCCL_BFLOAT16 1 #define RCCL_GATHER_SCATTER 1 #define RCCL_ALLTOALLV 1 -#define RCCL_MULTIRANKPERGPU 1 #ifdef __cplusplus extern "C" { @@ -50,6 +49,7 @@ typedef enum { ncclSuccess = 0, #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL +#define NCCL_SPLIT_NOCOLOR -1 /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ @@ -64,6 +64,7 @@ typedef struct ncclConfig_v21700 { int minCTAs; int maxCTAs; const char *netName; + int splitShare; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -76,7 +77,8 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ - NCCL_CONFIG_UNDEF_PTR /* netName */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. @@ -131,28 +133,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); /// @endcond -/*! @brief Creates a new communicator (multi thread/process version) allowing multiple ranks per device. 
- - @details - rank must be between 0 and nranks-1 and unique within a communicator clique. - Each rank is associated to a HIP device, which has to be set before calling - ncclCommInitRankMulti. - Since this version of the function allows multiple ranks to utilize the same - HIP device, a unique virtualId per device has to be provided by each calling - rank. - ncclCommInitRankMulti implicitly syncronizes with other ranks, so it must be - called by different threads/processes or use ncclGroupStart/ncclGroupEnd. - - @param[in] - comm ncclComm_t* - communicator struct pointer - */ - ncclResult_t ncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId); -/// @cond include_hidden - ncclResult_t pncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId); -/// @endcond - - /*! @brief Creates a clique of communicators (single process version). * * @details This is a convenience function to create a single-process communicator clique. @@ -191,6 +171,19 @@ ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); /// @endcond +/*! @brief Creates one or more communicators from an existing one. + * Ranks with the same color will end up in the same communicator. + * Within the new communicator, key will be used to order ranks. + * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group + * and will therefore return a NULL communicator. + * If config is NULL, the new communicator will inherit the original communicator's + * configuration*/ +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +/// @cond include_hidden +ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +/// @endcond + +/* Returns a string for each error code. */ /*! @brief Returns a string for each error code. 
*/ const char* ncclGetErrorString(ncclResult_t result); /// @cond include_hidden diff --git a/src/net.cc b/src/net.cc index d31a000202..2524d9c753 100644 --- a/src/net.cc +++ b/src/net.cc @@ -265,10 +265,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { ncclResult_t ncclNetInit(struct ncclComm* comm) { // Initialize main communication network - char* netName; + const char* netName; bool ok = false; - netName = comm->netName; + netName = comm->config.netName; for (int i=0; i<3; i++) { if (ncclNets[i] == nullptr) continue; enum ncclNetState state; @@ -309,27 +309,31 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { return ncclSuccess; } #endif - int netDevs; - NCCLCHECK(ncclNetDevices(comm, &netDevs)); - *gdrSupport = 0; - for (int dev=0; devcudaDev] == -1) { + int netDevs; + NCCLCHECK(comm->ncclNet->devices(&netDevs)); + gdrSupportMatrix[comm->cudaDev] = 0; + for (int dev=0; devncclNet->getProperties(dev, &props)); + if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - *gdrSupport = 1; - break; + gdrSupportMatrix[comm->cudaDev] = 1; + break; #endif // Allocate memory on the GPU and try to register it on the NIC. 
void *lComm = NULL, *sComm = NULL, *rComm = NULL; ncclNetHandle_t handle; - void* gpuPtr = NULL; + char* gpuPtr = NULL; void* mHandle = NULL; ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1); + NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); bool connected; connected = false; @@ -341,32 +345,34 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { } if (sComm == NULL) - NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2); + NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2); if (rComm == NULL) - NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2); + NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2); connected = (rComm != NULL) && (sComm != NULL); } - CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); - if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle)); - NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle)); - *gdrSupport = 1; + NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); + if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { + NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); + NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); + NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); + gdrSupportMatrix[comm->cudaDev] = 1; } ncclDebugNoWarn = 0; - CUDACHECK(cudaFree(gpuPtr)); + NCCLCHECK(ncclCudaFree(gpuPtr)); cleanup2: if (rComm != NULL) - NCCLCHECK(ncclNetCloseRecv(comm, rComm)); + NCCLCHECK(comm->ncclNet->closeRecv(rComm)); if (sComm != NULL) - NCCLCHECK(ncclNetCloseSend(comm, sComm)); - NCCLCHECK(ncclNetCloseListen(comm, lComm)); + NCCLCHECK(comm->ncclNet->closeSend(sComm)); + 
NCCLCHECK(comm->ncclNet->closeListen(lComm)); cleanup1: - break; + break; + } } + *gdrSupport = gdrSupportMatrix[comm->cudaDev]; return ncclSuccess; } diff --git a/src/proxy.cc b/src/proxy.cc index 74551365cd..c4b63ed346 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -65,6 +65,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi } memcpy(elem->respBuff, respBuff, respSize); + free(respBuff); elem->done = true; return ncclSuccess; } @@ -75,7 +76,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi return ncclInternalError; } -static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize, void* respData, int respDataSize) { +static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize) { struct ncclExpectedProxyResponse* ex; NCCLCHECK(ncclCalloc(&ex, 1)); ex->opId = opId; @@ -84,10 +85,6 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v ex->respBuff = malloc(respSize); ex->respSize = respSize; ex->done = false; - if (respData) { - memcpy(ex->respBuff, respData, respDataSize); - ex->done = true; - } // Enqueue struct ncclExpectedProxyResponse* list = state->expectedResponses; @@ -440,10 +437,11 @@ ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextO return ncclSuccess; } -ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) { - struct ncclProxyOps* proxyOps = proxyConn->comm->proxyState.proxyOps; +static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) { + int tpLocalRank = comm->topParentLocalRanks[comm->localRank]; + struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps; if (proxyOps == NULL) return ncclInternalError; - proxyOps += proxyConn->localRank; + proxyOps += proxyConn->tpLocalRank; struct 
ncclProxyOpsPool* pool = proxyOps->pool; TIME_START(0); @@ -454,9 +452,9 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyOps->freeOp = op->next; } else { int freeOp; - while ((freeOp = pool->freeOps[comm->localRank]) == -1) sched_yield(); + while ((freeOp = pool->freeOps[tpLocalRank]) == -1) sched_yield(); int freeOpNew; - while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+comm->localRank, freeOp, -1)) != freeOp) freeOp = freeOpNew; + while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+tpLocalRank, freeOp, -1)) != freeOp) freeOp = freeOpNew; opIndex = freeOp; op = pool->ops+opIndex; proxyOps->freeOp = op->next; @@ -501,13 +499,13 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* return ncclSuccess; } -static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { +static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; - struct ncclChannelPeer* peerComm = channel->peers+peer; + struct ncclChannelPeer* peerComm = channel->peers[peer]; struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) { - WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank, + WARN("Rank %d has no transport for %s peer %d on channel %d/%d", comm->rank, type == proxyRecv ? 
"recv" : "send", peer, channel->id, connIndex); return ncclInternalError; } @@ -515,13 +513,13 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s if (justInquire) *justInquire = true; else { - NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op)); + NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op)); } return ncclSuccess; } -ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { - NCCLCHECK(SaveProxy(channel, type, peer, op, connIndex, nullptr)); +ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { + NCCLCHECK(SaveProxy(comm, channel, type, peer, op, connIndex, nullptr)); return ncclSuccess; } @@ -537,10 +535,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool case ncclPatternPipelineTo: { struct ncclRing* ring = &channel->ring; if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) { - NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, op->connIndex, justInquire)); } if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) { - NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, op->connIndex, justInquire)); } } break; case ncclPatternTreeUp: @@ -549,30 +547,42 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool if (op->pattern != ncclPatternTreeDown) { // Tree up struct ncclTree* tree = &channel->tree; for (int i=0; idown[i], op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, tree->down[i], op, 0, justInquire)); } - NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->up, op, 0, 
justInquire)); } if (op->pattern != ncclPatternTreeUp) { // Tree down struct ncclTree* tree = &channel->tree; for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) { - NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->down[i], op, 0, justInquire)); } - NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, tree->up, op, 0, justInquire)); } } break; case ncclPatternCollnetChain: { - NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetChain.up, op, 1, justInquire)); - NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetChain.up, op, 1, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire)); } break; case ncclPatternCollnetDirect: { - NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetDirect.out, op, 1, justInquire)); - NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetDirect.out, op, 1, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire)); + } break; + case ncclPatternNvls: { + NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.out, op, 1, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.out, op, 0, justInquire)); + } break; + case ncclPatternNvlsTree: { + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[1], op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[2], op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeUp, op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[1], op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxySend, 
channel->nvls.treeDown[2], op, 0, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire)); } break; case ncclPatternSend: case ncclPatternRecv: { if (op->root == comm->rank) return ncclSuccess; - NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire)); + NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire)); } break; } return ncclSuccess; @@ -596,7 +606,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) info->chunkSize = stepSize; op->root = info->root; - struct ncclChannelPeer* peer = channel->peers + op->root; + struct ncclChannelPeer* peer = channel->peers[op->root]; if (info->coll == ncclFuncSend) { op->pattern = ncclPatternSend; if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { @@ -665,13 +675,13 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr return ncclSuccess; } -static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { +static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; struct ncclProxyArgs* op = opStart; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); - NCCLCHECK(op->progress(comm, op)); + NCCLCHECK(op->progress(proxyState, op)); if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone) { @@ -688,8 +698,8 @@ static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressS NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16); -static ncclResult_t ncclProxyGetPostedOps(struct 
ncclComm* comm, int* added) { - struct ncclProxyProgressState* state = &comm->proxyState.progressState; +static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int* added) { + struct ncclProxyProgressState* state = &proxyState->progressState; if (state->opsPool == NULL) return ncclInternalError; struct ncclProxyOpsPool* pool = state->opsPool; @@ -724,7 +734,7 @@ process_nextops: TIME_START(2); int freeOp[NCCL_MAX_LOCAL_RANKS]; int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; - for (int i=0; ilocalRanks; i++) freeOp[i] = -1; + for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1; uint64_t lastOpCount = 0; int lastPeer = -1; @@ -752,7 +762,7 @@ process_nextops: state->nextOps = opIndex; } - for (int i=0; ilocalRanks; i++) { + for (int i = 0; i < proxyState->tpLocalnRanks; i++) { if (freeOp[i] == -1) continue; int newFree = freeOp[i]; int oldFree = pool->freeOps[i]; @@ -784,7 +794,7 @@ void ncclDumpProxyState(int signal) { } NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0); -ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { +static int setProxyThreadContext(struct ncclProxyState* proxyState) { #if CUDART_VERSION >= 11030 static int createThreadContext = -1; @@ -798,44 +808,44 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) { } } if (createThreadContext) { - if (comm->proxyState.cudaCtx == NULL) { - if (CUPFN(cuCtxCreate(&comm->proxyState.cudaCtx, - CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) { - WARN("Failed to create CUDA context on device %d", comm->cudaDev); + if (proxyState->cudaCtx == NULL) { + if (CUPFN(cuCtxCreate(&proxyState->cudaCtx, + CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) { + WARN("Failed to create CUDA context on device %d", proxyState->cudaDev); createThreadContext = 0; - return ncclSuccess; } } else { - if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) { - WARN("Failed to set CUDA context on device %d", comm->cudaDev); - 
return ncclUnhandledCudaError; + if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) { + WARN("Failed to set CUDA context on device %d", proxyState->cudaDev); + return 0; } + return 1; } } #endif - return ncclSuccess; + return 0; } // Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1); NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8); -void* ncclProxyProgress(void *comm_) { - struct ncclComm* comm = (struct ncclComm*)comm_; - if (ncclSetThreadContext(comm) != ncclSuccess) { - WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev); - } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { - WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev); +void* ncclProxyProgress(void *proxyState_) { + struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_; + if (setProxyThreadContext(proxyState)) { + INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev); + } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { + WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev); } - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - struct ncclProxyProgressState* state = &comm->proxyState.progressState; + struct ncclProxyProgressState* state = &proxyState->progressState; state->nextOps = -1; const int sig = ncclParamProxyDumpSignal(); if (sig != -1) signal(sig, ncclDumpProxyState); ncclLastProxyState = state; char threadName[NCCL_THREAD_NAMELEN]; - snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev); + snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", proxyState->cudaDev); nvtxNameOsThreadA(syscall(SYS_gettid), threadName); int lastIdle = 0; @@ -846,11 +856,10 @@ void* ncclProxyProgress(void 
*comm_) { * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */ int proxyOpAppendCounter = 0; struct ncclProxyArgs profArgs; // Only used for profiling purposes - while ((state->stop == false || (state->stop == true && state->active)) && *comm->abortFlag == 0) { + while ((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) { int idle = 1; - ncclResult_t ret = progressOps(comm, state, state->active, &idle); + ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { - (void) ncclCommSetAsyncError(comm, ret); INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); return NULL; } @@ -861,10 +870,9 @@ void* ncclProxyProgress(void *comm_) { proxyOpAppendCounter = 0; TIME_START(3); if (state->stop == false) - ret = ncclProxyGetPostedOps(comm, &added); + ret = ncclProxyGetPostedOps(proxyState, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { - (void) ncclCommSetAsyncError(comm, ret); INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); } if (added == 0) { @@ -877,11 +885,11 @@ void* ncclProxyProgress(void *comm_) { } ncclResult_t ncclProxyStart(struct ncclComm* comm) { - struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps; + struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps; if (proxyOps == NULL) return ncclSuccess; TIME_START(1); - for (int r=0; rlocalRanks; r++) { - struct ncclProxyOps* ops = proxyOps+r; + for (int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) { + struct ncclProxyOps* ops = proxyOps + r; if (ops->pool == NULL || ops->nextOps == -1) continue; NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd)); ops->nextOps = ops->nextOpsEnd = -1; @@ -892,17 +900,17 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t ncclProxyProgressCreate(struct ncclComm* comm) { - struct ncclProxyProgressState* state = 
&comm->proxyState.progressState; +static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) { + struct ncclProxyProgressState* state = &proxyState->progressState; if (!state->thread) { - pthread_create(&state->thread, NULL, ncclProxyProgress, comm); - ncclSetThreadName(state->thread, "NCCL Progress%2d", comm->cudaDev); + pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState); + ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks); } return ncclSuccess; } -ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) { - struct ncclProxyProgressState* state = &comm->proxyState.progressState; +ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) { + struct ncclProxyProgressState* state = &proxyState->progressState; // Request the proxy to stop and then wake it if (state->opsPool) { @@ -954,26 +962,26 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, return ncclSuccess; } -static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { if (connection->send) { if (ncclTransports[connection->transport]->send.proxyFree) { - NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm)); + NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, proxyState)); } } else { if (ncclTransports[connection->transport]->recv.proxyFree) { - NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm)); + NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, proxyState)); } } return ncclSuccess; } -static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclComm* comm) { +static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclProxyState* proxyState) { for (int b=0; bbanks; b++) { 
int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE; for (int i=0; ipools[b]+i; if (connection->state != connUninitialized) { - NCCLCHECK(proxyFree(connection, comm)); + NCCLCHECK(proxyFree(connection, proxyState)); } } free(pool->pools[b]); @@ -984,122 +992,155 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo #include "transport.h" -ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) { +struct ncclProxyInitReq { + int transport; + int send; + int tpLocalRank; + int tpRank; + int sameProcess; +}; + +struct ncclProxyInitResp { + ncclProxyConnection* connection; + char devShmPath[6]; // "XXXXXX" - May or may not be set +}; + +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) { struct ncclSocket* sock; - int ready; - int type = ncclProxyMsgInit; + int ready, proxyRank = -1; + struct ncclProxyState* sharedProxyState = comm->proxyState; // Keep one connection per mlocal rank + for (int i = 0; i < comm->localRanks; ++i) { + /* find the proxy rank in comm. */ + if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) { + proxyRank = comm->localRankToRank[i]; + break; + } + } + proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; + // Keep one connection per local rank proxyConn->connection = NULL; - proxyConn->rank = rank; - if (comm->proxyState.peerSocks == NULL) { - NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks)); - NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks)); - NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks)); - for (int i = 0; i < comm->localRanks; ++i) { - NCCLCHECK(ncclSocketSetFd(-1, &comm->proxyState.peerSocks[i])); + proxyConn->tpRank = tpProxyRank; + if (sharedProxyState->peerSocks == NULL) { + NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks)); + NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks)); + NCCLCHECK(ncclCalloc(&sharedProxyState->sharedDevMems, comm->sharedRes->tpNLocalRanks)); + for (int i = 0; i < comm->sharedRes->tpNLocalRanks; ++i) { + NCCLCHECK(ncclSocketSetFd(-1, &sharedProxyState->peerSocks[i])); } } - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank)); - sock = comm->proxyState.peerSocks + proxyConn->localRank; + proxyConn->tpLocalRank = comm->sharedRes->tpRankToLocalRank[proxyConn->tpRank]; + sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; NCCLCHECK(ncclSocketReady(sock, &ready)); if (!ready) { - NCCLCHECK(ncclSocketInit(sock, comm->proxyState.peerAddresses+rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag)); + NCCLCHECK(ncclSocketInit(sock, sharedProxyState->peerAddresses+proxyConn->tpRank, comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(sock)); } - NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int))); - NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int))); - NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int))); - NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int))); - NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*))); - struct ncclTransportComm* tcomm = send ? 
&ncclTransports[transport]->send : &ncclTransports[transport]->recv; + + struct ncclProxyInitReq req = {0}; + req.transport = transport; + req.send = send; + req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; + req.tpRank = comm->topParentRanks[comm->rank]; + req.sameProcess = proxyConn->sameProcess; + + struct ncclProxyInitResp resp = {0}; + // This usually sends proxyConn->connection to identify which connection this is. + // However, this is part of the response and therefore is ignored + NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgInit, &req, sizeof(req), &resp, sizeof(resp))); + proxyConn->connection = resp.connection; + // If we need proxy progress, map progress ops + struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv; if (tcomm->proxyProgress) { char poolPath[] = "/dev/shm/nccl-XXXXXX"; - NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1)); - struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank; + strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); + struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank; if (proxyOps->pool == NULL) { - NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); + NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle)); proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } - INFO(NCCL_NET, "Connection to proxy localRank %d -> connection %p", proxyConn->localRank, proxyConn->connection); - proxyConn->comm = comm; + INFO(NCCL_NET|NCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection); return ncclSuccess; } +// cuMem API support +// The response is sent out-of-band using ncclIpcSocket for this specific command +ncclResult_t 
ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd) { + ncclResult_t ret = ncclSuccess; + ncclResult_t res = ncclInProgress; + struct ncclIpcSocket ipcSock = { 0 }; + void* opId = malloc(1); + // Create a UDS socket to receive the converted fd + NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag)); + + // Request the conversion of the fd over sockets + NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error); + + // Receive converted fd over UDS + NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, convertedFd)); + TRACE(NCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd); + NCCLCHECK(ncclIpcSocketClose(&ipcSock)); + + while (res == ncclInProgress) { + res = ncclPollProxyResponse(comm, proxyConn, NULL, opId); + } + + free(opId); + return res; + +error: + NCCLCHECK(ncclIpcSocketClose(&ipcSock)); + WARN("ncclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank); + return ret; +} + const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" }; -ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { +ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; - void* respData = NULL; - int respDataSize = 0; - struct ncclComm* comm = proxyConn->comm; - struct ncclIpcSocket ipcSock = { 0 }; + struct ncclProxyState* sharedProxyState = comm->proxyState; - if (*comm->abortFlag != 0) { - WARN("ncclProxyCallAsync() - Saw abortFlag while waiting for proxyThread response"); - return ncclInternalError; - } - if 
(comm->proxyState.peerSocks == NULL) return ncclInternalError; + if (sharedProxyState->peerSocks == NULL) return ncclInternalError; - sock = comm->proxyState.peerSocks + proxyConn->localRank; + sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; if (sock == NULL) return ncclInternalError; - if (type == ncclProxyMsgConvertFd) { - // cuMem API support - // Create a UDS socket to receive the converted fd - NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, (uint64_t)proxyConn->connection, comm->abortFlag)); - } - NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error); if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error); - if (type == ncclProxyMsgConvertFd) { - // cuMem API support - int recvFd = -1; - if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError; - // Receive converted fd over UDS - NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, &recvFd)); - TRACE(NCCL_NET, "UDS: ConvertFd rank %d returned %p %d", proxyConn->localRank, &recvFd, recvFd); - assert(recvFd != -1); - respData = &recvFd; - respDataSize = sizeof(recvFd); - NCCLCHECK(ncclIpcSocketClose(&ipcSock)); - } else { - // Send opId to proxy - NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error); - } + // Send opId to proxy + NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error); + // Add proxyOp to expected response queue - NCCLCHECK(expectedProxyResponseEnqueue(&comm->proxyState, opId, respSize, respData, respDataSize)); + NCCLCHECK(expectedProxyResponseEnqueue(sharedProxyState, opId, respSize)); return ncclSuccess; error: - NCCLCHECK(ncclIpcSocketClose(&ipcSock)); - WARN("Proxy Call to rank %d failed (%s)", comm->localRankToRank[proxyConn->localRank], 
ncclProxyMsgTypeStr[type]); return ret; } -ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) { - struct ncclComm* comm = proxyConn->comm; - +ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) { + struct ncclProxyState* sharedProxyState = comm->proxyState; // Receive the connection pointer from the Proxy if (*comm->abortFlag) { WARN("Comm %p is in abort state", comm); return ncclInternalError; } - if (comm->proxyState.peerSocks == NULL) return ncclInternalError; + if (sharedProxyState->peerSocks == NULL) return ncclInternalError; // Check response queue int found = 0; - NCCLCHECK(expectedProxyResponseDequeue(&comm->proxyState, opId, respBuff, &found)); + NCCLCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found)); if (found == 0) { // Attempt to read in a new response header from the proxy thread - struct ncclSocket* sock = comm->proxyState.peerSocks + proxyConn->localRank; + struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; void* recvOpId; int offset = 0; @@ -1116,7 +1157,7 @@ ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* r NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)); } - INFO(NCCL_PROXY, "ncclPollProxyResponse Recieved new opId=%p", recvOpId); + INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", recvOpId); // Now do a blocking recv of the response size int respSize = 0; @@ -1124,17 +1165,22 @@ ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* r // If there's a respSize to recv if (respSize > 0) { + if (recvOpId != opId) { + // Unexpected response, need to buffer the socket data + respBuff = malloc(respSize); + } + assert(respBuff != NULL); NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize)); } if (recvOpId == opId) { INFO(NCCL_PROXY, "recvOpId=%p matches expected 
opId=%p", recvOpId, opId); - NCCLCHECK(expectedProxyResponseRemove(&comm->proxyState, recvOpId)); + NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId)); return ncclSuccess; } else { - INFO(NCCL_PROXY, "Queing opId=%p", recvOpId); + INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize); // Store the result and mark response as completed - NCCLCHECK(expectedProxyResponseStore(&comm->proxyState, recvOpId, respBuff, respSize)); + NCCLCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize)); return ncclInProgress; } } else { @@ -1144,38 +1190,37 @@ ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* r return ncclSuccess; } -ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { +ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { // Alloc some memory to act as a handle + ncclResult_t res = ncclSuccess; void* opId = malloc(1); - NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId)); - ncclResult_t res = ncclInProgress; + NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail); - while (res == ncclInProgress) { - res = ncclPollProxyResponse(proxyConn, respBuff, opId); - } + do { + res = ncclPollProxyResponse(comm, proxyConn, respBuff, opId); + } while (res == ncclInProgress); +exit: free(opId); - return res; +fail: + goto exit; } -static ncclResult_t proxyProgressInit(struct ncclComm* comm) { - struct ncclProxyProgressState* state = &comm->proxyState.progressState; +static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { + struct ncclProxyProgressState* state = &proxyState->progressState; if (state->opsPool == NULL) { int size = sizeof(struct ncclProxyOpsPool); struct 
ncclProxyOpsPool* pool = NULL; - // The service thread may be launched already but localRanks may not be set yet. - while (comm->localRanks == 0) sched_yield(); - char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; shmPath[0] = '\0'; - NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, comm->localRanks + 1, &state->handle)); + NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle)); // Init pool pool->nextOps = -1; - for (int r=0; rlocalRanks; r++) { + for (int r = 0; r < proxyState->tpLocalnRanks; r++) { pool->freeOps[r] = r*MAX_OPS_PER_PEER; for (int i=0; iops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1; pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1; @@ -1194,20 +1239,20 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) { memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); // All ops structures are created, we can start the progress thread - NCCLCHECK(ncclProxyProgressCreate(comm)); + NCCLCHECK(ncclProxyProgressCreate(proxyState)); } return ncclSuccess; } -static void proxyOpsFree(struct ncclComm* comm) { - struct ncclProxyProgressState* state = &comm->proxyState.progressState; +static void proxyOpsFree(struct ncclProxyState* proxyState) { + struct ncclProxyProgressState* state = &proxyState->progressState; if (ncclShmClose(state->handle) != ncclSuccess) { WARN("[Service thread] shm close failed"); } } ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { - struct ncclProxyProgressState* state = &comm->proxyState.progressState; + struct ncclProxyProgressState* state = &comm->proxyState->progressState; if (state->opsPool == NULL) return ncclSuccess; if (ncclShmUnlink(state->handle) != ncclSuccess) { @@ -1216,97 +1261,75 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { return ncclSuccess; } -static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { - struct 
ncclSocket* sock = &peer->sock; +static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, ncclProxyInitReq* req, ncclProxyInitResp* resp, struct ncclProxyConnection** connection) { int id; - struct ncclProxyConnection* connection; NCCLCHECK(ncclProxyNewConnection(connectionPool, &id)); - NCCLCHECK(ncclProxyGetConnection(connectionPool, id, &connection)); - connection->sock = sock; - NCCLCHECK(ncclSocketRecv(sock, &connection->transport, sizeof(int))); - NCCLCHECK(ncclSocketRecv(sock, &connection->send, sizeof(int))); - NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int))); - connection->localRank = peer->localRank; - NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*))); - connection->tcomm = connection->send ? &ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv; + NCCLCHECK(ncclProxyGetConnection(connectionPool, id, connection)); + + (*connection)->sock = &peer->sock; + (*connection)->transport = req->transport; + (*connection)->send = req->send; + (*connection)->tpLocalRank = req->tpLocalRank; + (*connection)->sameProcess = req->sameProcess; + peer->tpLocalRank = req->tpLocalRank; + peer->tpRank = req->tpRank; + + resp->connection = *connection; + + (*connection)->tcomm = (*connection)->send ? 
&ncclTransports[(*connection)->transport]->send : &ncclTransports[(*connection)->transport]->recv; // If we need proxy progress, let's allocate ops and start the thread - if (connection->tcomm->proxyProgress) { - NCCLCHECK(proxyProgressInit(comm)); - struct ncclProxyProgressState* state = &comm->proxyState.progressState; - NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); + if ((*connection)->tcomm->proxyProgress) { + NCCLCHECK(proxyProgressInit(proxyState)); + struct ncclProxyProgressState* state = &proxyState->progressState; + strncpy(resp->devShmPath, state->opsPoolShmSuffix, sizeof(resp->devShmPath)); } - INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport); - __atomic_store_n(&connection->state, connInitialized, __ATOMIC_RELEASE); - return ncclSuccess; -} - -static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { - struct ncclSocket* sock = &peer->sock; - struct ncclProxyConnection* connection; - NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(void*))); - int reqSize, respSize; - NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int))); - NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int))); - if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError; - int nChannels; - NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int))); - - // Store opId for completion response - void* opId; - NCCLCHECK(ncclSocketRecv(sock, &opId, sizeof(opId))); - INFO(NCCL_PROXY, "proxyConnSharedInit received opId=%p", opId); - - if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels)); - __atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE); - - // Send the opId for referencing async operation - INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(opId=%p)", 
opId); - NCCLCHECK(ncclSocketSend(connection->sock, &opId, sizeof(opId))); - - // Send the response size - INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(op.respSize=%d)", respSize); - NCCLCHECK(ncclSocketSend(connection->sock, &respSize, sizeof(respSize))); - + INFO(NCCL_NET|NCCL_PROXY, "New proxy %s connection %d from local rank %d, transport %d", (*connection)->send ? "send":"recv", id, (*connection)->tpLocalRank, (*connection)->transport); + __atomic_store_n(&(*connection)->state, connInitialized, __ATOMIC_RELEASE); return ncclSuccess; } // cuMem API support -static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, struct ncclComm* comm) { - struct ncclSocket* sock = &peer->sock; - uint64_t connection; - NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(uint64_t))); - int reqSize, respSize; - NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int))); - NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int))); - if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError; - - int fd; +static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, int fd) { struct ncclIpcSocket ipcSock = { 0 }; - NCCLCHECK(ncclSocketRecv(sock, &fd, sizeof(int))); + uint64_t hash = (uint64_t) opId; - INFO(NCCL_NET, "UDS: proxyConvertFd received fd %d peer %d connection %lx", fd, peer->localRank, connection); + INFO(NCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash); // Send back the converted fd using UDS - NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, connection^1, comm->abortFlag)); - NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->localRank, connection)); + NCCLCHECK(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag)); + NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash)); NCCLCHECK(ncclIpcSocketClose(&ipcSock)); return ncclSuccess; } -static ncclResult_t proxyProgressAsync(struct 
ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount, struct ncclProxyLocalPeer* peer) { +static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) { int done = 1; if (op->type == ncclProxyMsgSetup) { - INFO(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId); - NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId); + NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); } else if (op->type == ncclProxyMsgConnect) { - INFO(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff); - NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff); + NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + } else if (op->type == ncclProxyMsgSharedInit) { + int nChannels = (int) *op->reqBuff; + TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels); + if (op->connection->tcomm->proxySharedInit) NCCLCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels)); + __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE); + } else if (op->type == ncclProxyMsgConvertFd) { + int fd = *(int *)op->reqBuff; + TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd); + NCCLCHECK(proxyConvertFd(peer, op->opId, 
proxyState, fd)); // cuMem API support + } else if (op->type == ncclProxyMsgInit) { + TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff); + NCCLCHECK(proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection)); } else return ncclInternalError; + if (done) { + INFO(NCCL_PROXY, "proxyProgressAsync opId=%p op.type=%d op.reqBuff=%p op.respSize=%d done", op->opId, op->type, op->reqBuff, op->respSize); if (op->type == ncclProxyMsgSetup) __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE); else if (op->type == ncclProxyMsgConnect) __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE); - /* if setup or connect is done, we should not return any error at this point since + /* if setup or connect is done, we should not return any error at this point since * ncclSocketSend might already send the respBuff to the requester. If we still choose * to abort and close the connection, it can cause segfault if the requester is using * the respBuff. 
*/ @@ -1326,14 +1349,14 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC (*asyncOpCount)--; return ncclSuccess; - } else if (*comm->abortFlag != 0) { + } else if (*proxyState->abortFlag != 0) { return ncclInternalError; } return ncclInProgress; } -static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) { +static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) { struct ncclSocket* sock = &peer->sock; struct ncclProxyAsyncOp* asyncOp; NCCLCHECK(ncclCalloc(&asyncOp, 1)); @@ -1356,21 +1379,34 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p asyncProxyOpEnqueue(peer, asyncOp); (*asyncOpCount)++; - NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount, peer)); + NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool)); return ncclSuccess; } #include -void* ncclProxyService(void* _args) { - struct ncclComm* comm = (struct ncclComm *) _args; - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - if (ncclSetThreadContext(comm) != ncclSuccess) { - WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev); - } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { - WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev); +static bool proxyMatchOpType(int type) { + switch (type) { + case ncclProxyMsgInit: + case ncclProxyMsgSharedInit: + case ncclProxyMsgSetup: + case ncclProxyMsgConnect: + case ncclProxyMsgConvertFd: + return true; + default: + return false; } - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); +} + +void* ncclProxyService(void* _args) { + struct ncclProxyState* proxyState = (struct 
ncclProxyState*) _args; + // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + if (setProxyThreadContext(proxyState)) { + INFO(NCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev); + } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { + WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev); + } + // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); // Prepare poll descriptor struct ncclProxyConnectionPool connectionPool; @@ -1385,7 +1421,7 @@ void* ncclProxyService(void* _args) { pollfds[s].fd = -1; pollfds[s].events = POLLHUP|POLLIN; } - if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { + if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { WARN("[Proxy Service] Get listenSock fd fails"); return NULL; }; @@ -1399,7 +1435,7 @@ void* ncclProxyService(void* _args) { /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer * connections. Need to wait until all other related comms call abort and safely exit * together, or we could face segmentation fault. */ - if (*comm->abortFlag != 0) stop = 1; + if (*proxyState->abortFlag != 0) stop = 1; /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. 
*/ int ret; do { @@ -1421,7 +1457,7 @@ void* ncclProxyService(void* _args) { WARN("[Service thread] Initialize peers[%d].sock fails", s); return NULL; } - if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) { + if (ncclSocketAccept(&peers[s].sock, proxyState->listenSock) != ncclSuccess) { WARN("[Service thread] Accept failed %s", strerror(errno)); } else { if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) { @@ -1429,7 +1465,7 @@ void* ncclProxyService(void* _args) { return NULL; } npeers++; - peers[s].localRank = -1; + peers[s].tpLocalRank = -1; } } for (int s=0; sasyncOps; while (op != nullptr) { + ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */ type = op->type; - res = proxyProgressAsync(op, comm, &asyncOpCount, peer); + res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool); if (res == ncclSuccess || res == ncclInProgress) { - op = op->next; + op = opnext; } else { // Res is a bad result closeConn = 1; @@ -1460,10 +1497,10 @@ void* ncclProxyService(void* _args) { int closed; res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/); if (res != ncclSuccess && res != ncclInProgress) { - WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->localRank, res, closed); + WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed); closeConn = 1; } else if (closed) { - INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank); + INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank); closeConn = 1; } else if (res == ncclSuccess) { // We received something from the sock if (type == ncclProxyMsgStop) { @@ -1471,17 +1508,10 @@ void* ncclProxyService(void* _args) { closeConn = 1; } else if (type == ncclProxyMsgClose) { closeConn = 1; - } else if (type == 
ncclProxyMsgInit) { - res = proxyConnInit(peers+s, &connectionPool, comm); - } else if (type == ncclProxyMsgSharedInit) { - res = proxyConnSharedInit(peers+s, &connectionPool, comm); - } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) { - INFO(NCCL_PROXY, "proxyConnSetupConnect for peer->localRank %d,", peer->localRank); - res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount); - } else if (type == ncclProxyMsgConvertFd) { - res = proxyConvertFd(peers+s, comm); // cuMem API support + } else if (proxyMatchOpType(type)) { + res = proxyServiceInitOp(type, peers+s, &connectionPool, proxyState, &asyncOpCount); } else { - WARN("[Service thread] Unknown command %d from localRank %d", type, peer->localRank); + WARN("[Service thread] Unknown command %d from localRank %d", type, peer->tpLocalRank); closeConn = 1; } @@ -1491,7 +1521,7 @@ void* ncclProxyService(void* _args) { closeConn = 1; } if (res != ncclSuccess && res != ncclInProgress) { - WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res); + WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res); closeConn = 1; } @@ -1509,67 +1539,106 @@ void* ncclProxyService(void* _args) { } // Wait for all operations to complete and stop progress thread before freeing any resource - if (ncclProxyProgressDestroy(comm) != ncclSuccess) { + if (ncclProxyProgressDestroy(proxyState) != ncclSuccess) { WARN("[Proxy Service] proxyDestroy failed"); } for (int s=0; sproxyState.listenSock); - proxyOpsFree(comm); + ncclProxyFreeConnections(&connectionPool, proxyState); + ncclSocketClose(proxyState->listenSock); + free(proxyState->listenSock); + proxyOpsFree(proxyState); return NULL; } ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) 
{ - comm->proxyState.listenSock = sock; - comm->proxyState.peerAddresses = peerAddresses; + assert(comm->sharedRes->proxyState == NULL); + NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1)); + comm->proxyState = comm->sharedRes->proxyState; + comm->proxyState->refCount = 1; + comm->proxyState->listenSock = sock; + comm->proxyState->peerAddresses = peerAddresses; return ncclSuccess; } ncclResult_t ncclProxyCreate(struct ncclComm* comm) { - // comm->proxyState.thread is pthread_join()'d by commFree() in init.cc - pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm); - ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev); + /* proxyState is shared among parent comm and split comms. comm->proxyState->thread is + * pthread_join()'d by commFree() in init.cc when the refCount reduces down to 0. */ + struct ncclProxyState* proxyState = comm->proxyState; + if (proxyState->refCount == 1) { + /* we have to make sure all following fields in comm have been initialized. 
*/ + proxyState->tpRank = comm->rank; + proxyState->tpnRanks = comm->nRanks; + proxyState->tpLocalnRanks = comm->localRanks; + proxyState->cudaDev = comm->cudaDev; + proxyState->abortFlag = comm->abortFlag; + proxyState->p2pnChannels = comm->p2pnChannels; + proxyState->p2pChunkSize = comm->p2pChunkSize; + proxyState->nChannels = comm->nChannels; + proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers; + proxyState->dmaBufSupport = comm->dmaBufSupport; + proxyState->ncclNet = comm->ncclNet; + proxyState->ncclCollNet = comm->ncclCollNet; + memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes)); + + pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState); + ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev); + } + return ncclSuccess; +} + +ncclResult_t ncclProxyStop(struct ncclComm* comm) { + if (comm->sharedRes && comm->sharedRes->proxyState) { + struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; + + if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { + if (sharedProxyState->peerAddresses) { + if (*comm->abortFlag == 0) { + struct ncclSocket sock; + int type = ncclProxyMsgStop; + NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); + NCCLCHECK(ncclSocketConnect(&sock)); + NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); + NCCLCHECK(ncclSocketClose(&sock)); + } + } + + if (sharedProxyState->peerSocks) { + int tplocalRanks = comm->sharedRes->tpNLocalRanks; + for (int i = 0; i < tplocalRanks; i++) { + int fd; + NCCLCHECK(ncclSocketGetFd(sharedProxyState->peerSocks + i, &fd)); + if (fd >= 0) { + if (sharedProxyState->proxyOps[i].pool) { + NCCLCHECK(ncclShmClose(sharedProxyState->proxyOps[i].handle)); + } + if (sharedProxyState->sharedDevMems[i]) { + if (!ncclCuMemEnable()) { + 
CUDACHECK(cudaIpcCloseMemHandle(sharedProxyState->sharedDevMems[i])); + } + } + int type = ncclProxyMsgClose; + if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int))); + NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i)); + } + } + } + } + } + return ncclSuccess; } ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { - struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; - if (state == NULL) return ncclSuccess; - if (state->peerAddresses) { - if (*comm->abortFlag == 0) { - struct ncclSocket sock; - int type = ncclProxyMsgStop; - NCCLCHECK(ncclSocketInit(&sock, comm->proxyState.peerAddresses + comm->rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag)); - NCCLCHECK(ncclSocketConnect(&sock)); - NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); - NCCLCHECK(ncclSocketClose(&sock)); - } - free(state->peerAddresses); - } - - if (state->peerSocks) { - for (int i=0; ilocalRanks; i++) { - int fd; - NCCLCHECK(ncclSocketGetFd(state->peerSocks + i, &fd)); - if (fd >= 0) { - if (state->proxyOps[i].pool) { - NCCLCHECK(ncclShmClose(state->proxyOps[i].handle)); - } - if (state->sharedDevMems[i]) { - CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i])); - } - int type = ncclProxyMsgClose; - if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks + i, &type, sizeof(int))); - NCCLCHECK(ncclSocketClose(state->peerSocks + i)); - } - } - free(state->peerSocks); - free(state->proxyOps); - free(state->sharedDevMems); - expectedProxyResponseFree(state); - } + assert(sharedProxyState->refCount == 0); + free(sharedProxyState->peerAddresses); + free(sharedProxyState->peerSocks); + free(sharedProxyState->proxyOps); + free(sharedProxyState->sharedDevMems); + expectedProxyResponseFree(sharedProxyState); + free(sharedProxyState); return ncclSuccess; } diff --git a/src/transport.cc b/src/transport.cc index f1c30fa01b..896082059b 100644 --- 
a/src/transport.cc +++ b/src/transport.cc @@ -22,8 +22,8 @@ template static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) { struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; - struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : - comm->channels[channelId].peers[peer].recv + connIndex; + struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer]->send + connIndex : + comm->channels[channelId].peers[peer]->recv + connIndex; // handle intra-node network connections int n1 = -1, n2 = -1; if (connIndex == NCCL_CONN_IDX_P2P_NET) { @@ -57,12 +57,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int n uint64_t mask = 1UL << channel->id; for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue; comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask; } for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue; comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? 
NCCL_CONN_IDX_P2P_NET : 0)] |= mask; } return ncclSuccess; @@ -85,7 +85,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); @@ -154,13 +154,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* for (int c=0; cchannels[c].peers[sendPeer].send + connIndex; + struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { + struct ncclDevChannelPeer* addr; conn->connected = 1; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail); + /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. 
*/ + CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -171,13 +174,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* // Start with recv channels TIME_START(4); if (recvMask & (1UL<channels[c].peers[recvPeer].recv + connIndex; + struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { + struct ncclDevChannelPeer* addr; conn->connected = 1; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail); + /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. 
*/ + CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -203,8 +209,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (highestTransportType != NULL) *highestTransportType = highestType; TIME_PRINT("P2P Setup/Connect"); exit: - NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream)); + NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); return ret; fail: goto exit; @@ -238,7 +244,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN } // select - struct ncclChannelPeer* root = channel->peers+nranks; + struct ncclChannelPeer* root = channel->peers[nranks]; // connector index: 0 for recv, 1 for send struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send); @@ -277,8 +283,9 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN // connect if (isMaster) { NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); - struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks; - struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? 
devRoot->recv+type : devRoot->send+type; + struct ncclDevChannelPeer* devRoot; + CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup); + struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type; CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup); } // recv side sends connect info to send side @@ -317,16 +324,20 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { // Free collNet resources for (int r=0; rnChannels; r++) { struct ncclChannel* channel = comm->channels+r; - struct ncclChannelPeer* peer = channel->peers+comm->nRanks; - for (int b=0; bsend + b; - if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); - send->transportResources = NULL; // avoid double free - } - for (int b=0; brecv + b; - if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv)); - recv->transportResources = NULL; // avoid double free + struct ncclChannelPeer* peer = channel->peers[comm->nRanks]; + if (peer) { + if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) { + for (int b=0; bsend + b; + if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); + send->transportResources = NULL; // avoid double free + } + for (int b=0; brecv + b; + if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv)); + recv->transportResources = NULL; // avoid double free + } + } } } return ncclSuccess; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 7a5e012e02..a3eb18f3a3 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -144,24 +144,26 @@ struct setupReq { int netDev; int useGdr; int needFlush; + struct ncclCollNetSharedRes* collNet; }; /* Setup send connector, and return connect information for 
others in the coll * communicator to connect to me */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - struct setupReq req; + struct setupReq req = { 0 }; - int proxyRank; + int proxyRank, tpProxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; - // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank)); - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); - NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank)); + tpProxyRank = comm->topParentRanks[myInfo->rank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn)); + ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); + req.collNet = comm->collNetSharedRes; + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? 
"/GDRDMA" : "", comm, comm->nRanks); @@ -169,17 +171,22 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph } static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - struct setupReq req; + struct setupReq req = { 0 }; - int proxyRank; + int proxyRank, tpProxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; + // Determine whether we need to flush the GDR buffer on recv or not + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank)); - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank)); + tpProxyRank = comm->topParentRanks[myInfo->rank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn)); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; - NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); + ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); + req.collNet = comm->collNetSharedRes; + NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? 
"/GDRDMA" : "", comm, comm->nRanks); @@ -224,7 +231,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne // We're on the same process as the proxy. We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct connectMap* map; - NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); // If collnet connect failed, propagate error to fallback on regular p2p if (map == NULL) return ncclSystemError; @@ -250,7 +257,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne // We're on the same process as the proxy. We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct connectMap* map; - NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); + NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); // If collnet connect failed, propagate error to fallback on regular p2p if (map == NULL) return ncclSystemError; @@ -279,7 +286,7 @@ static ncclResult_t recvFree(struct ncclConnector* recv) { return ncclSuccess; } -static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*)reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; @@ -291,9 
+298,10 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->netDev = req->netDev; resources->useGdr = req->useGdr; ncclNetProperties_t props; - NCCLCHECK(collNetGetProperties(comm, req->netDev, &props)); + NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); + connection->collNet = req->collNet; /* DMA-BUF support */ - resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); return ncclSuccess; } @@ -303,19 +311,19 @@ struct sharedResources { int commRefCount[NCCL_MAX_NETDEVS]; }; -ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) { - struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; +static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, struct ncclCollNetSharedRes* collNet, void* collNetHandle) { + struct sharedResources* resources = (struct sharedResources*)collNet->resources; if (resources == NULL) { NCCLCHECK(ncclCalloc(&resources, 1)); - comm->proxyState.progressState.collNet.resources = resources; + collNet->resources = resources; } if (resources->collNetComms[netDev] == NULL) - NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev)); + NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev)); return ncclSuccess; } -static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) { - struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; +static ncclResult_t sharedConnect(struct ncclProxyState* proxyState, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclCollNetSharedRes* 
collNet, void** collNetComm) { + struct sharedResources* resources = (struct sharedResources*)collNet->resources; if (resources->collNetComms[netDev] == NULL) { // Connect to coll comm collNetHandle_t** handlePtrs = NULL; @@ -324,13 +332,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i); handlePtrs[i] = &(info->collNetHandle); } - ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank, + ncclResult_t ret = proxyState->ncclCollNet->connect((void**)handlePtrs, nranks, rank, resources->collNetListenComms[netDev], resources->collNetComms+netDev); free(handlePtrs); if (ret == ncclSuccess) { // Close listen comm - NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev])); + NCCLCHECK(proxyState->ncclCollNet->closeListen(resources->collNetListenComms[netDev])); } else { resources->collNetListenComms[netDev] = NULL; } @@ -340,55 +348,53 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl return ncclSuccess; } -static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) { - struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; +static ncclResult_t sharedFree(struct ncclProxyState* proxyState, struct ncclCollNetSharedRes* collNet, int netDev) { + struct sharedResources* resources = (struct sharedResources*)collNet->resources; resources->commRefCount[netDev]--; if (resources->commRefCount[netDev] == 0) { - NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev])); + NCCLCHECK(proxyState->ncclCollNet->closeColl(resources->collNetComms[netDev])); } for (int n=0; ncommRefCount[n]) return ncclSuccess; - comm->proxyState.progressState.collNet.resources = NULL; + collNet->resources = NULL; free(resources); return ncclSuccess; } -static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** 
cpuPtr, int* size) { - struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet; - if (state->size == 0) { - state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE]; +static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int cuda, char** gpuPtr, char** cpuPtr, int* size) { + if (collNet->size == 0) { + collNet->size = 2 * collNet->nChannels * collNet->buffSize; } - *size = state->size; + *size = collNet->size; - if (cuda && state->cudaBuff == NULL) { - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, comm->sideStream, cuda)); + if (cuda && collNet->cudaBuff == NULL) { + NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda)); } - if (!cuda && state->hostBuff == NULL) { - NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size)); + if (!cuda && collNet->hostBuff == NULL) { + NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size)); } - *gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; + *gpuPtr = *cpuPtr = cuda ? collNet->cudaBuff : collNet->hostBuff; return ncclSuccess; } -static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) { +static ncclResult_t sharedBuffersGet(struct ncclCollNetSharedRes* collNet, int type, int slot, int channel, int* offset) { // Use different pools for different channels and also separate send/recv. 
- int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; - int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel; + int slotSize = collNet->buffSize / NCCL_STEPS; + int globalSlot = (type * NCCL_STEPS + slot) * collNet->nChannels + channel; *offset = slotSize * globalSlot; return ncclSuccess; } -static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) { - struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet; - if (state->size == 0) return ncclSuccess; - CUDACHECK(cudaFree(state->cudaBuff)); - NCCLCHECK(ncclCudaHostFree(state->hostBuff)); +static ncclResult_t sharedBuffersDestroy(struct ncclCollNetSharedRes* collNet) { + if (collNet->size == 0) return ncclSuccess; + NCCLCHECK(ncclCudaFree(collNet->cudaBuff)); + NCCLCHECK(ncclCudaHostFree(collNet->hostBuff)); // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once. - state->size = 0; + collNet->size = 0; return ncclSuccess; } -static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*)reqBuff; if (reqSize != sizeof (struct setupReq)) return ncclInternalError; @@ -401,18 +407,19 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->useGdr = req->useGdr; resources->needFlush = req->needFlush; ncclNetProperties_t props; - NCCLCHECK(collNetGetProperties(comm, req->netDev, &props)); + NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); + connection->collNet = req->collNet; /* DMA-BUF support */ - resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->useDmaBuf = 
resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; - NCCLCHECK(sharedListen(comm, req->netDev, netHandle)); + NCCLCHECK(sharedListen(proxyState, req->netDev, req->collNet, netHandle)); return ncclSuccess; } -static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); @@ -426,7 +433,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str for (int p=0; precvMhandles[p] = info->mhandles[p]; - NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); + NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm)); // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. 
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } @@ -434,7 +441,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str *((struct connectMap**)respBuff) = NULL; return ncclSuccess; } - connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev; + connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev; struct connectMap* map = &resources->map; @@ -445,7 +452,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; @@ -462,7 +469,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str // Allocate & Register shared buffers for the Simple protocol int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; - NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); + NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); #if CUDA_VERSION >= 11070 @@ -470,23 +477,23 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { - NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, - resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->sendMhandles[NCCL_PROTO_SIMPLE])); } *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } -static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; @@ -494,7 +501,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); resources->collNetRank = args->rank; - NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); + NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm)); // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. 
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } @@ -502,7 +509,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str *((struct connectMap**)respBuff) = NULL; return ncclSuccess; } - connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1; + connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev + 1; struct connectMap* map = &resources->map; @@ -513,7 +520,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; @@ -531,7 +538,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str // Allocate & Register shared buffers for the Simple protocol int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; - NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); + NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); #if CUDA_VERSION >= 11070 @@ -539,16 +546,16 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->mhandles[NCCL_PROTO_SIMPLE])); + NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { - NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size, - resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, - &resources->mhandles[NCCL_PROTO_SIMPLE])); + NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->mhandles[NCCL_PROTO_SIMPLE])); } // Pass info to send side @@ -561,41 +568,43 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str return ncclSuccess; } -static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); if (resources) { for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { if (resources->sendMhandles[p]) { - NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p])); + NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->sendMhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); - NCCLCHECK(sharedBuffersDestroy(comm)); - NCCLCHECK(sharedFree(comm, resources->netDev)); + NCCLCHECK(sharedBuffersDestroy(connection->collNet)); + NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev)); + if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet); free(connection->transportResources); } return ncclSuccess; } -static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct recvResources* resources = (struct recvResources*)(connection->transportResources); if (resources) { for (int p=0; pmhandles[p]) { - NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p])); + 
NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); - NCCLCHECK(sharedBuffersDestroy(comm)); - NCCLCHECK(sharedFree(comm, resources->netDev)); + NCCLCHECK(sharedBuffersDestroy(connection->collNet)); + NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev)); + if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet); free(connection->transportResources); } return ncclSuccess; @@ -605,7 +614,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct #define LAST_OF_GROUP(s) \ (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1) -static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { +static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -633,7 +642,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; int sharedBuffSlot = sub->posted%NCCL_STEPS; int offset; - NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset)); + NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset)); resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize; __sync_synchronize(); volatile uint64_t* sendHead = resources->gdcSync ? 
resources->gdcSync : &resources->sendMem->head; @@ -654,7 +663,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int ready = 1; if (s == 0) { int offset; - NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset)); + NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset)); args->sharedBuff[sharedBuffSlot] = localBuff + offset; args->sharedSize[sharedBuffSlot] = args->chunkSize; } @@ -680,7 +689,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype); reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot]; char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot]; - NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] == NULL) continue; TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]); @@ -696,7 +705,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg int done, size; int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; - NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size)); + NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size)); if (done) { TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size); // Make 
sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) @@ -720,7 +729,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg return ncclSuccess; } -static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { +static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -751,7 +760,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int sharedBuffSlot = sub->posted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; int offset; - NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); + NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset)); reqFifo[group][buffSlot].recvBuff = localBuff + offset; TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff); sub->posted += args->sliceSteps; @@ -782,8 +791,8 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg } else { int startChannel = group*COLLNET_GROUP_NSUBS; int offset; - NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); - NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); + NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset)); + NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); } } else { for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; @@ -797,7 +806,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base 
+ sub->flushed)%NCCL_STEPS; int done = 1; - if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL)); + if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL)); if (done) { TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot); for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; @@ -811,7 +820,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int sharedBuffSlot = sub->transmitted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; int offset; - NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); + NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset)); volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo; offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize; __sync_synchronize(); diff --git a/src/transport/net.cc b/src/transport/net.cc index a8fafcc10f..748be8ca42 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -13,6 +13,7 @@ #include "collectives.h" #include "gdrwrap.h" #include "shm.h" +#include "p2p.h" #include "profiler.h" #include "graph.h" #include "graph/topo.h" @@ -67,10 +68,8 @@ struct connectMapMem{ char* gpuPtr; char* cpuPtr; int size; - union { - char shmPath[PATH_MAX]; - cudaIpcMemHandle_t ipc; - }; + ncclIpcDesc ipcDesc; + char shmPath[PATH_MAX]; ncclShmHandle_t attachHandle; ncclShmHandle_t createHandle; }; @@ -95,9 +94,9 @@ struct sendResources { struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; - int rank; - int localRank; - int remoteRank; + int tpRank; + int tpLocalRank; + int tpRemoteRank; int netDev; int useGdr; int useDmaBuf; @@ -122,10 +121,10 @@ struct recvResources { struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; - int rank; - int localRank; - int remoteRank; - int proxyRank; + int tpRank; + int tpLocalRank; + int 
tpRemoteRank; + int tpRemoteProxyRank; int netDev; int useGdr; int useDmaBuf; @@ -162,9 +161,9 @@ NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1); struct setupReq { - int rank; - int localRank; - int remoteRank; + int tpRank; + int tpLocalRank; + int tpRemoteRank; int shared; int netDev; int useGdr; @@ -177,7 +176,8 @@ struct setupReq { /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - struct setupReq req; + struct setupReq req = { 0 }; + int localRank, tpProxyRank; send->conn.shared = req.shared = (graph || mscclAvailable() && mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; req.channelId = channelId; @@ -195,20 +195,22 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph send->conn.curr_hdp_reg = req.curr_hdp_reg; } - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); - req.rank = myInfo->rank; - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); - req.remoteRank = peerInfo->rank; - NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); + tpProxyRank = comm->topParentRanks[proxyRank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn)); + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank)); + req.tpLocalRank = comm->topParentLocalRanks[localRank]; + req.tpRank = comm->topParentRanks[myInfo->rank]; + req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - 
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks); } - *((int*)connectInfo) = proxyRank; + *((int*)connectInfo) = tpProxyRank; return ncclSuccess; } @@ -219,7 +221,8 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); /* Setup recv connector */ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - struct setupReq req; + struct setupReq req = { 0 }; + int localRank; recv->conn.shared = req.shared = (graph || mscclAvailable() && mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? 
ncclParamNetSharedBuffers() : 1; req.channelId = channelId; @@ -227,7 +230,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.netDev = -1; // Use myInfo->rank as the receiver uses its own NIC - int proxyRank = myInfo->rank; + int proxyRank = myInfo->rank, tpProxyRank; if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &req.netDev)); if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); @@ -236,13 +239,15 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); // We don't support PXN on receive yet - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); + tpProxyRank = comm->topParentRanks[myInfo->rank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn)); - req.rank = myInfo->rank; - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); - req.remoteRank = peerInfo->rank; - NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank)); + req.tpLocalRank = comm->topParentLocalRanks[localRank]; + req.tpRank = comm->topParentRanks[myInfo->rank]; + req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; + NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); + 
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks); return ncclSuccess; } @@ -297,39 +302,47 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne send->transportResources = map; opId = send; INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); - NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId)); + NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId)); } else { opId = send; } ncclResult_t ret; - NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId)); + NCCLCHECK(ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId)); if (ret == ncclInProgress) { return ret; } INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId); - if (map->sameProcess) { + if (map->sameProcess && !ncclCuMemEnable()) { if (map->cudaDev != comm->cudaDev) { - // Enable P2P access - cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0); - if (err == cudaErrorPeerAccessAlreadyEnabled) { - cudaGetLastError(); - } else if (err != cudaSuccess) { - WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err)); - return ncclInternalError; + if (!ncclCuMemEnable()) { + // Enable P2P access for Legacy IPC + cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0); + if (err == cudaErrorPeerAccessAlreadyEnabled) { + cudaGetLastError(); + } else if (err != cudaSuccess) { + WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err)); + return ncclInternalError; + } } } - } else { - NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); + 
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { + if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { - CUDACHECK(cudaIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess)); + NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, + map->mems[NCCL_NET_MAP_DEVMEM].size, + &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, + (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL; } if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) { - void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank; + void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank; if (*sharedDevMemPtr == NULL) { - CUDACHECK(cudaIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess)); + NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, + map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size, + &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc, + sharedDevMemPtr)); } map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr); map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL; @@ -363,13 +376,13 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne opId = recv; INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p", opId, &recv->proxyConn, connectInfo); - NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId)); + NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId)); } else { opId = recv; } ncclResult_t ret; - NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId)); + NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId)); 
if (ret == ncclInProgress) { return ret; } @@ -394,10 +407,24 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne static ncclResult_t sendFree(struct ncclConnector* send) { struct connectMap* map = (struct connectMap*)(send->transportResources); if (map) { - if (map->sameProcess == 0) { - NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + if (map->sameProcess && map->cudaDev == cudaDev) { + // Our own GPU, so it wasn't mapped in + free(map); + return ncclSuccess; + } + if (!map->sameProcess || ncclCuMemEnable()) { + if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { - CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + if (ncclCuMemEnable()) { + // cuMem API support + NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); + NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } else { + // Legacy CUDA IPC support + CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } } } free(map); @@ -412,86 +439,87 @@ static ncclResult_t recvFree(struct ncclConnector* recv) { } #define NCCL_SHARED_STEPS 16 -static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess, - int nChannels, char** gpuPtr, char** cpuPtr, int* size, cudaIpcMemHandle_t* ipc) { +static ncclResult_t sharedBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess, + int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) { if (cuda == 0 && sameProcess == 0) { WARN("PXN should not use host buffers for data"); return ncclInternalError; } - struct ncclProxyProgressState* progressState = &comm->proxyState.progressState; + struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers 
== NULL) { - NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; - if (localPeers[localRank] == NULL) { - NCCLCHECK(ncclCalloc(localPeers+localRank, 1)); + if (localPeers[tpLocalRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers + tpLocalRank, 1)); } - struct ncclProxyPeer* peer = localPeers[localRank]; + struct ncclProxyPeer* peer = localPeers[tpLocalRank]; struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv; state->refcount++; if (state->size == 0) { - state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pChunkSize; + state->size = nChannels * NCCL_SHARED_STEPS * proxyState->p2pChunkSize; } if (size) *size = state->size; if (cuda && state->cudaBuff == NULL) { - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, comm->sideStream, cuda)); - if (sameProcess == 0) { - CUDACHECK(cudaIpcGetMemHandle(&state->ipc, state->cudaBuff)); + if (sameProcess == 0 || ncclCuMemEnable()) { + NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff)); + } else { + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, cuda)); } } if (!cuda && state->hostBuff == NULL) { NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size)); } if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; - if (sameProcess) { - if (gpuPtr) *gpuPtr = *cpuPtr; - } else { - if (gpuPtr) *gpuPtr = NULL; - if (ipc) memcpy(ipc, &state->ipc, sizeof(cudaIpcMemHandle_t)); - } + if (gpuPtr) *gpuPtr = sameProcess ? 
*cpuPtr : NULL; + if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc)); return ncclSuccess; } -static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) { +static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) { // Use different pools for different channels and also separate send/recv. int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; - *offset = comm->p2pChunkSize * globalSlot; + *offset = proxyState->p2pChunkSize * globalSlot; return ncclSuccess; } -static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) { - if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); - struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank]; +static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) { + if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); + struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank]; if (peer == NULL) NCCLCHECK(ncclInternalError;) struct ncclProxySharedP2p* state = type == 0 ? 
&peer->send : &peer->recv; if (state->size == 0) NCCLCHECK(ncclInternalError); - state->refcount--; - if (state->refcount == 0) { - if (state->cudaBuff) CUDACHECK(cudaFree(state->cudaBuff)); + if (ncclAtomicRefCountDecrement(&state->refcount) == 0) { + if (state->cudaBuff) { + if (!connection->sameProcess || ncclCuMemEnable()) { + NCCLCHECK(ncclP2pFreeShareableBuffer(&state->ipcDesc)); + } + NCCLCHECK(ncclCudaFree(state->cudaBuff)); + } if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff)); } + if (peer->send.refcount || peer->recv.refcount) return ncclSuccess; + free(peer); - comm->proxyState.progressState.localPeers[localRank] = NULL; - for (int r=0; r<comm->localRanks; r++) { - if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess; + proxyState->progressState.localPeers[tpLocalRank] = NULL; + for (int r = 0; r < proxyState->tpLocalnRanks; r++) { + if (proxyState->progressState.localPeers[r]) return ncclSuccess; } // All peers are freed, free array - free(comm->proxyState.progressState.localPeers); - comm->proxyState.progressState.localPeers = NULL; + free(proxyState->progressState.localPeers); + proxyState->progressState.localPeers = NULL; return ncclSuccess; } -static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) { - int rank = comm->localRankToRank[connection->localRank]; - int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; - NCCLCHECK(sharedBuffersInit(comm, comm->hasFineGrain, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL)); +static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) { + NCCLCHECK(sharedBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL)); return ncclSuccess; } -static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; @@ -499,9 +527,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; - resources->rank = req->rank; - resources->localRank = req->localRank; - resources->remoteRank = req->remoteRank; + resources->tpRank = req->tpRank; + resources->tpLocalRank = req->tpLocalRank; + resources->tpRemoteRank = req->tpRemoteRank; resources->netDev = req->netDev; resources->shared = connection->shared = req->shared; resources->useGdr = req->useGdr; @@ -509,9 +537,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->connIndex = req->connIndex; resources->curr_hdp_reg = req->curr_hdp_reg; ncclNetProperties_t props; - NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); + NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ - resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && 
(props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; // We don't return any data @@ -520,7 +548,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc return ncclSuccess; } -static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; @@ -528,9 +556,9 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; - resources->rank = req->rank; - resources->localRank = req->localRank; - resources->remoteRank = req->remoteRank; + resources->tpRank = req->tpRank; + resources->tpLocalRank = req->tpLocalRank; + resources->tpRemoteRank = req->tpRemoteRank; resources->netDev = req->netDev; resources->shared = connection->shared = req->shared; resources->useGdr = req->useGdr; @@ -538,50 +566,50 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; - NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props)); + NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ - resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; - NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, 
&resources->netListenComm)); + NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; } -static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; ncclResult_t ret = ncclSuccess; if (resources->shared) { // Shared buffers - struct ncclProxyProgressState* progressState = &comm->proxyState.progressState; + struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { - NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; - if (localPeers[resources->localRank] == NULL) { - NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1)); + if (localPeers[resources->tpLocalRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1)); } - connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId; + connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->send.proxyAppend + resources->channelId; if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // Connect or reuse connection for a netdev/remote rank. 
if (progressState->netComms[resources->netDev] == NULL) { - NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); + NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } - struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId); + struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; + if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, comms->sendComm + resources->channelId); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm); + ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm); } } else { // Connect to remote peer - ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm); + ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -594,28 +622,27 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str // Create structures struct connectMap* map = &resources->map; - map->sameProcess = - comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; + map->sameProcess = connection->sameProcess; map->shared = resources->shared; CUDACHECK(cudaGetDevice(&map->cudaDev)); if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]); - resources->buffSizes[p] = comm->buffSizes[p]; + NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, proxyState->buffSizes[p], buffs[p]); + resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { // Get shared buffers int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit( - comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels, - &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc)); + proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels, + &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; - if (comm->allocP2pNetLLBuffers) { - NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); - resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL]; + if (proxyState->allocP2pNetLLBuffers) { + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); @@ -626,15 +653,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { - if (!map->sameProcess) { + if (!map->sameProcess || ncclCuMemEnable()) { ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN); + 
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, + (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } else { + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr)); } - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, comm->sideStream, resources->useGdr)); map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } - if (!map->sameProcess) { - CUDACHECK(cudaIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); - } } if (map->sameProcess) { NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); @@ -644,7 +671,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; @@ -669,24 +696,24 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to 
nv_peermem GDR path #else /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; - if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) { + if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) { int dmabuf_fd; uint64_t offset; CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset)); - NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld", (const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset); } else // FALL-THROUGH to nv_peermem GDR path #endif { - NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } } } @@ -697,40 +724,40 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str return ncclSuccess; } -static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(int)) return ncclInternalError; struct recvResources* resources = (struct recvResources*)(connection->transportResources); - resources->proxyRank = *(int*)reqBuff; + resources->tpRemoteProxyRank = *(int*)reqBuff; ncclResult_t ret = ncclSuccess; // Finish connection establishment from remote peer if (resources->shared) { // Shared buffers - struct ncclProxyProgressState* progressState = &comm->proxyState.progressState; + struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { - NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; - if (localPeers[resources->localRank] == NULL) { - NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1)); + if (localPeers[resources->tpLocalRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1)); } - connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId; + connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->recv.proxyAppend + resources->channelId; if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // Connect or reuse connection for a netdev/remote rank. 
if (progressState->netComms[resources->netDev] == NULL) { - NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); + NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } - struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; - if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId); + struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank; + if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId); resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { - ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm); + ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm); } } else { // Connect to remote peer - ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm); + ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -741,26 +768,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str } *done = 1; - NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm)); + NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm)); // Create structures struct connectMap* map = &resources->map; - map->sameProcess = - comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; + map->sameProcess = connection->sameProcess; if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv map->shared = resources->shared; if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]); - resources->buffSizes[p] = comm->buffSizes[p]; + NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, proxyState->buffSizes[p], buffs[p]); + resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { // Get shared buffers int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit( - comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels, + proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); @@ -769,14 +795,19 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); - if (comm->allocP2pNetLLBuffers) { - NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); - resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL]; + if (proxyState->allocP2pNetLLBuffers) { + NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); + resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, comm->sideStream, 
resources->useGdr)); + if (ncclCuMemEnable()) { + NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, + (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } else { + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr)); + } map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } } @@ -784,7 +815,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && map->sameProcess) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; @@ -807,24 +838,24 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #else /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST; - if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) { + if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) { int dmabuf_fd; uint64_t offset; CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset)); - NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld", (const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset); } else // FALL-THROUGH to nv_peermem GDR path #endif { - NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } } } @@ -835,17 +866,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str return ncclSuccess; } -static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); if (connection->state == connSharedInitialized) { // NVB Preconnect - NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0)); + NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection)); return ncclSuccess; } if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { - NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -854,19 +885,25 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct } else { NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle)); } - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (!resources->map.sameProcess || ncclCuMemEnable()) { + // cuMem API support + if (mems[NCCL_NET_MAP_DEVMEM].size) { + NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); + } + } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { - NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0)); + NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { - struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank; + struct 
ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank; comms->sendRefCount[resources->channelId]--; - if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId])); + if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId])); } else { - NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm)); + NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); } } else { - NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm)); + NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); } } @@ -874,37 +911,43 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct return ncclSuccess; } -static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct recvResources* resources = (struct recvResources*)(connection->transportResources); if (connection->state == connSharedInitialized) { // NVB Preconnect - NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1)); + NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection)); return ncclSuccess; } if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { - NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p])); + NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); - CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (!resources->map.sameProcess || ncclCuMemEnable()) { + // cuMem API support + if (mems[NCCL_NET_MAP_DEVMEM].size) { + 
NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); + } + } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { - NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1)); + NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { - struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank; + struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank; comms->recvRefCount[resources->channelId]--; - if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId])); + if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId])); } else { - NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm)); + NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); } } else { - NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm)); + NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); } } - + if (resources) free(resources); return ncclSuccess; } @@ -915,12 +958,10 @@ static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to c static int g_npkit_net_poll_cnt = 0; #endif -static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { - +static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) g_npkit_net_poll_cnt++; #endif - if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -952,7 +993,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg if (resources->shared) { int 
sharedBuffSlot = sub->posted%maxDepth; int offset; - NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset)); + NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset)); resources->recvMem->offsFifo[buffSlot] = offset; __sync_synchronize(); volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; @@ -1010,7 +1051,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg *resources->curr_hdp_reg = 1; } // Data is ready, try to send. - NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) @@ -1044,7 +1085,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg if (sub->done < sub->transmitted) { int done; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; - NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL)); + NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL)); if (done) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) @@ -1086,12 +1127,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg return ncclSuccess; } -static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { - +static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) g_npkit_net_poll_cnt++; #endif - if (args->state == ncclProxyOpReady) { // Initialize subs and group them by same recvComm. 
void* recvComm; @@ -1151,7 +1190,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg if (p == NCCL_PROTO_SIMPLE && resources->shared) { int sharedBuffSlot = sub->posted%maxDepth; int offset; - NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset)); + NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset)); volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo; offsFifo[buffSlot] = offset; ptrs[subCount] = localBuff+offset; @@ -1160,7 +1199,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg } sizes[subCount] = stepSize*args->sliceSteps; if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; - tags[subCount] = resources->remoteRank; + tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = resources->mhandles[p]; subCount++; } @@ -1169,7 +1208,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg uint64_t step = subGroup->posted; struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); - NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; @@ -1207,7 +1246,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg int sizes[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; irequests[step%NCCL_STEPS], &done, sizes)); + NCCLCHECK(proxyState->ncclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes)); if (done) { int needFlush = 0; int totalSize = 0; @@ -1264,7 +1303,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct 
ncclProxyArg } } struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); - NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); + NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } } args->idle = 0; @@ -1279,7 +1318,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg uint64_t step = subGroup->transmitted; int done = 1; void* request = subGroup->requests[step%NCCL_STEPS]; - if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL)); + if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL)); if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index a01f391133..195372e81e 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -106,6 +106,7 @@ static void* ncclIbAsyncThreadMain(void* args) { } NCCL_PARAM(IbDisable, "IB_DISABLE", 0); +NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1); static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) { char devicePath[PATH_MAX]; @@ -117,7 +118,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) // Merge multi-port NICs into the same PCI device p[strlen(p)-1] = '0'; // Also merge virtual functions (VF) into the same device - p[strlen(p)-3] = '0'; + if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; // And keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dgidInfo.link_layer = portAttr.link_layer; if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q=0; qnqps; q++) INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid); } else { // RoCE - union ibv_gid gid; - 
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid)); - qpInfo.spn = gid.global.subnet_prefix; - qpInfo.iid = gid.global.interface_id; + NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid)); + qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix; + qpInfo.iid = comm->gidInfo.localGid.global.interface_id; for (int q=0; qnqps; q++) INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); } @@ -716,6 +729,8 @@ ib_connect: memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo)); + comm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn; + comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid; for (int q=0; qnqps; q++) { struct ibv_qp* qp = comm->qps[q]; NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo)); @@ -777,6 +792,9 @@ ib_recv: /* copy back the received info */ memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo)); + rComm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn; + rComm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid; + // IB setup struct ibv_context* ctx; uint8_t ib_port; @@ -784,8 +802,7 @@ ib_recv: ib_port = ncclIbDevs[lComm->dev].port; struct ibv_port_attr portAttr; NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); - union ibv_gid gid; - NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid)); + NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &rComm->gidInfo.localGid)); // QP Creation NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs)); @@ -812,7 +829,8 @@ ib_recv: if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; // Allocate Flush dummy buffer for GPU Direct RDMA - rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 
1 : 0; + rComm->gpuFlush.enabled = ((ncclIbGdrSupport(lComm->dev) == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) + && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0; if (rComm->gpuFlush.enabled) { NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE)); rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem; @@ -823,8 +841,8 @@ ib_recv: localQpInfo.lid=portAttr.lid; localQpInfo.link_layer=portAttr.link_layer; localQpInfo.ib_port=ib_port; - localQpInfo.spn=gid.global.subnet_prefix; - localQpInfo.iid=gid.global.interface_id; + localQpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix; + localQpInfo.iid=rComm->gidInfo.localGid.global.interface_id; localQpInfo.mtu=portAttr.active_mtu; NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, rComm->gpuFlush.qp->qp_num, &localQpInfo)); NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp)); @@ -833,11 +851,11 @@ ib_recv: // Fill Handle struct ncclIbQpInfo qpInfo; qpInfo.lid=portAttr.lid; - qpInfo.link_layer=portAttr.link_layer; + qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer; qpInfo.ib_port=ib_port; for (int q=0; qnqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num; - qpInfo.spn=gid.global.subnet_prefix; - qpInfo.iid=gid.global.interface_id; + qpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix; + qpInfo.iid=rComm->gidInfo.localGid.global.interface_id; qpInfo.mtu=remQpInfo.mtu; stage->state = ncclIbCommStateSend; @@ -875,6 +893,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** r->verbs = verbs; r->events = 1; r->sock = NULL; + r->gidInfo = NULL; *req = r; return ncclSuccess; } @@ -979,6 +998,8 @@ returning: return res; } +NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1); + ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; @@ -1034,9 +1055,10 @@ ncclResult_t 
ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work const int align = 128; - for (int q=0; qnqps; q++) { + const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; + for (int q=0; qsend.size, comm->nqps), align) * align; + int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align; int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize); if (length <= 0) { comm->wrs[r].sg_list = NULL; @@ -1048,10 +1070,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { } } struct ibv_send_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr)); + NCCLCHECK(wrap_ibv_post_send(comm->qps[comm->qpIndex], comm->wrs, &bad_wr)); + comm->qpIndex = (comm->qpIndex+1)%comm->nqps; for (int r=0; rsend.size, comm->nqps), align) * align; + int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align; reqs[r]->send.offset += chunkSize; comm->sges[r].addr += chunkSize; comm->wrs[r].wr.rdma.remote_addr += chunkSize; @@ -1111,7 +1134,8 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh req->send.data = data; req->send.lkey = mr->lkey; req->send.offset = 0; - req->events = comm->nqps; + req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; + if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo; *request = reqs[r] = req; // If this is a multi-recv, send only when all requests have matched. 
@@ -1205,6 +1229,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->sock; req->nreqs = n; + if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo; for (int i=0; irecv.sizes[i] = 0; struct ibv_recv_wr wr; @@ -1215,13 +1240,15 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta wr.num_sge = 0; TIME_START(1); - for (int q=0; qnqps; q++) { - struct ibv_qp* qp = comm->qps[q]; + const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; + for (int q=0; qqps[comm->qpIndex]; struct ibv_recv_wr* bad_wr; NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr)); + comm->qpIndex = (comm->qpIndex+1)%comm->nqps; } TIME_STOP(1); - req->events = comm->nqps; + req->events = nqps; *request = req; @@ -1292,8 +1319,16 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { char line[SOCKET_NAME_MAXLEN+1]; union ncclSocketAddress addr; ncclSocketGetAddr(r->sock, &addr); - WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d", - ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err); + char localGidString[INET6_ADDRSTRLEN] = ""; + char remoteGidString[INET6_ADDRSTRLEN] = ""; + const char* localGidStr = NULL, *remoteGidStr = NULL; + if (r->gidInfo) { + localGidStr = inet_ntop(AF_INET6, &r->gidInfo->localGid, localGidString, sizeof(localGidString)); + remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString)); + } + WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d (%s)%s%s%s%s", + ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], + localGidStr ? " localGid ":"", localGidString, remoteGidStr ? 
" remoteGid ":"", remoteGidString); return ncclRemoteError; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 336877ce2b..633cb04d8e 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -43,22 +43,7 @@ struct ncclTransport nvlsTransport = { { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL } }; -#define NVLS_HANDLE_SIZE 64 - -struct nvlsResources { - CUmulticastObjectProp properties; - CUmemAccessDesc accessDesc; - int dev; - size_t size; - size_t granularity; - CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer - char* mcBuff; // Multicast NVLS buffer address - CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer - char* ucBuff; // Unicast NVLS buffer address -}; - - -ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) { +ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) { CUmulticastObjectProp* prop = &resources->properties; memset(prop, 0, sizeof(*prop)); prop->size = size; @@ -81,7 +66,7 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* reso return ncclSuccess; } -ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) { +ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) { size_t size = resources->size; // Create a Multicast group @@ -103,24 +88,13 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resour return ncclSuccess; } -ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) { +ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev); 
CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev)); return ncclSuccess; } -ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) { - int dev = resources->dev; - size_t size = resources->size; - INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev); - - // Unbind physical memory from group for the given device - CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size)); - - return ncclSuccess; -} - -ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) { +ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, char* shareableHandle) { CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE; INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank); @@ -131,9 +105,11 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resou int fd = *(int *)shareableHandle; TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd); struct ncclProxyConnector proxyConn; - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn)); + int tpProxyRank = comm->topParentRanks[rank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn)); TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank); - NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int))); + NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, (int *)shareableHandle)); + fd = *(int *)shareableHandle; TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank); CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type)); } else { @@ -146,7 +122,20 @@ ncclResult_t nvlsGroupConnect(struct ncclComm 
*comm, struct nvlsResources* resou return ncclSuccess; } -ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) { +ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { + CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE; + + // Import and map the remote memory descriptor to the local GPU + if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + // cuMem UDS support + int fd = *(int *)resources->shareableHandle; + (void) close(fd); + } + + return ncclSuccess; +} + +ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { size_t size = resources->size; size_t granularity; CUdeviceptr ptr = 0; @@ -178,7 +167,21 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resou return ncclSuccess; } -ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) { +ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { + int dev = resources->dev; + size_t size = resources->size; + INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev); + + // Unbind physical memory from group for the given device + CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size)); + + // Release the MC group resources + NCCLCHECK(nvlsGroupDisconnect(comm, resources)); + + return ncclSuccess; +} + +ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { size_t size = resources->size; CUdeviceptr ptr = 0; @@ -196,7 +199,7 @@ ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resour return ncclSuccess; } -ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) { +ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { size_t size; CUdeviceptr ptr; INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", @@ 
-224,135 +227,172 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* reso #define NVLS_MEM_ALIGN_SIZE (1 << 21) +NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16); -NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1); +ncclResult_t ncclNvlsInit(struct ncclComm* comm) { + comm->nvlsSupport = 0; + comm->nvlsChannels = 0; + + int gpuCount; + NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount)); + if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess; -ncclResult_t ncclNvlsSetup(struct ncclComm* comm) { - if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes>1) return ncclSuccess; CUdevice dev; int driverVersion; + if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess; - CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); + CUCHECK(cuCtxGetDevice(&dev)); CUDACHECK(cudaDriverGetVersion(&driverVersion)); - comm->nvlsSupport = 0; - // NVLS Multicast support requires CUDA12.1 UMD + KMD - if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) { - CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - } - INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? 
"" : "not ", dev); - if (comm->nvlsSupport == 0) return ncclSuccess; - - int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels())); - int rank = comm->localRank, nranks = comm->localRanks; - - for (int c=0; cnvlsResources = resources; - - size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; - size_t memSize = NVLS_MEM_ALIGN_SIZE; - size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize); - size_t nvlsTotalSize = nvlsPerRankSize*nranks; - - INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi", - comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize); - - char* nvlsShareableHandle = NULL; - NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup); - NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup); - if (rank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup); + if (ncclParamNvlsEnable() == 2) { + // NVLS Multicast support requires CUDA12.1 UMD + KMD + if (CUPFN(cuMulticastCreate) != NULL /*&& driverVersion >= 12010 */) { + CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); + } } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup); - NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup); + comm->nvlsSupport = 1; } - NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup); - NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup); - // Local intra-node barrier to ensure everyone has bound their memory to the group - 
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup); - NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup); + INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); + if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels())); + return ncclSuccess; +} - for (int c=0; cchannels+c; - channel->nvls.nHeads = nranks; - for (int i=0; invls.up[i] = -1; - channel->nvls.down = comm->nRanks+1+comm->localRank; - channel->nvls.out = -1; // Network not yet implemented. - channel->nvls.headRank = comm->localRank; // Network not yet implemented. - } +ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { + if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; - for (int r=0; rnRanks+1+r; - for (int c=0; cchannels+c; - channel->nvls.up[r] = nvlsPeer; + int nHeads = comm->channels[0].nvls.nHeads; + int headRank = comm->channels[0].nvls.headRank; - char* mem = NULL; - struct ncclChannelPeer* peer = channel->peers+nvlsPeer; + CUdevice dev; + CUCHECK(cuCtxGetDevice(&dev)); - // Reduce UC -> MC - mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize); - peer->send[0].transportComm = &nvlsTransport.send; - peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->send[0].conn.head = (uint64_t*)(mem+buffSize); - peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2); - mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize); - peer->recv[1].transportComm = &nvlsTransport.recv; - peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->recv[1].conn.head = (uint64_t*)(mem+buffSize); - peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2); - peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL; + ncclResult_t res = ncclSuccess; + bool nvlsShare = true; + if (parent && parent->nvlsSupport 
&& parent->config.splitShare && parent->localRanks == comm->localRanks) + nvlsShare = true; + else + nvlsShare = false; - // Broadcast MC -> UC - mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize); - peer->recv[0].transportComm = &nvlsTransport.recv; - peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->recv[0].conn.head = (uint64_t*)(mem+buffSize); - peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2); - mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize); - peer->send[1].transportComm = &nvlsTransport.send; - peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->send[1].conn.head = (uint64_t*)(mem+buffSize); - peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2); - peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL; + if (nvlsShare) { + /* reuse NVLS resources */ + comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); + for (int c = 0; c < comm->nvlsChannels; c++) { + NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup); + } - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup); + comm->nvlsResources = parent->nvlsResources; + ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount); + } else { + int nChannels; + struct 
ncclNvlsSharedRes* resources; - /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p", - nvlsPeer, c, - resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize), - resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize), - resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize), - resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/ + NCCLCHECK(ncclCalloc(&resources, 1)); + comm->nvlsResources = resources; + resources->refCount = 1; + + if (parent && parent->config.splitShare) { + /* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels + * to make sure nvlsChannels match for each rank. */ + comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); + } + + nChannels = resources->nChannels = comm->nvlsChannels; + for (int c = 0; c < nChannels; c++) { + NCCLCHECK(initNvlsChannel(comm, c, parent, false)); + } + + size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; + size_t memSize = NVLS_MEM_ALIGN_SIZE; + size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize); + size_t nvlsTotalSize = nvlsPerRankSize * nHeads; + + INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi", + comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize); + + char* shareableHandle = resources->shareableHandle; + NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup); + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, comm->localRank, comm->localRanks, shareableHandle), res, cleanup); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, 
cleanup); + NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, comm->localRankToRank[0], shareableHandle), res, cleanup); + } + + NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup); + NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup); + // Local intra-node barrier to ensure everyone has bound their memory to the group + NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup); + NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup); + + for (int h = 0; h < nHeads; h++) { + int nvlsPeer = comm->nRanks + 1 + h; + for (int c = 0; c < nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + char* mem = NULL; + struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; + + // Reduce UC -> MC + mem = resources->ucBuff + (h * 2 * nChannels + c) * (buffSize + memSize); + peer->send[1].transportComm = &nvlsTransport.send; + peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; + peer->send[1].conn.head = (uint64_t*)(mem + buffSize); + peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize); + peer->recv[0].transportComm = &nvlsTransport.recv; + peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; + peer->recv[0].conn.head = (uint64_t*)(mem + buffSize); + peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL; + + // Broadcast MC -> UC + mem = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); + peer->recv[1].transportComm = &nvlsTransport.recv; + peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; + peer->recv[1].conn.head = (uint64_t*)(mem + buffSize); + peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); + peer->send[0].transportComm = &nvlsTransport.send; + 
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; + peer->send[0].conn.head = (uint64_t*)(mem + buffSize); + peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; + + struct ncclDevChannelPeer* addr; + CUDACHECKGOTO(cudaMemcpyAsync(&addr, comm->channels[c].devPeers + nvlsPeer, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), res, cleanup); + CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); + CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); + CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); + CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); + + /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p", + nvlsPeer, c, + resources->mcBuff + (h*2*nChannels+c)*(buffSize+memSize), + resources->mcBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize), + resources->ucBuff + (h*2*nChannels+c)*(buffSize+memSize), + resources->ucBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize));*/ + } } } - free(nvlsShareableHandle); return res; cleanup: comm->nvlsSupport = 0; - free(nvlsShareableHandle); return res; } ncclResult_t ncclNvlsFree(struct ncclComm* comm) { - struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources; + struct ncclNvlsSharedRes* resources = (struct ncclNvlsSharedRes*)comm->nvlsResources; if (resources == NULL) return ncclSuccess; - NCCLCHECK(nvlsGroupUnbind(comm, resources)); - NCCLCHECK(nvlsGroupUnmapMem(comm, 
resources)); - free(resources); - comm->nvlsResources = NULL; + + if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) { + NCCLCHECK(nvlsGroupUnbind(comm, resources)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources)); + free(resources); + comm->nvlsResources = NULL; + } return ncclSuccess; } @@ -362,7 +402,12 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { * Pre CUDA 12.1 stubs */ -ncclResult_t ncclNvlsSetup(struct ncclComm* comm) { +ncclResult_t ncclNvlsInit(struct ncclComm* comm) { + comm->nvlsChannels = 0; + return ncclSuccess; +} + +ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index d73d451e01..460b4bf4e9 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -11,17 +11,21 @@ #include "shm.h" #include "graph.h" #include "graph/topo.h" +#include "p2p.h" + +enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; struct ncclP2pBuff { void* directPtr; - cudaIpcMemHandle_t devIpc; + size_t size; + ncclIpcDesc ipcDesc; }; struct p2pConnectInfo { int rank; int read; struct ncclP2pBuff p2pBuff; - // Use by CE memcpy + // Used by CE memcpy char shmName[7]; int shmSize; }; @@ -31,7 +35,7 @@ struct p2pShm { struct ncclSendMem sendMem; struct ncclRecvMem recvMem; }; -struct p2pProxyInfo { +struct p2pShmProxyInfo { // Shared memory between proxy and receiving GPU struct p2pShm* shm; struct p2pShm* devShm; @@ -46,29 +50,33 @@ struct p2pProxyInfo { // Receiver buffer char* recvFifo; - // Used by progress only + // Used by CE memcpy progress only uint64_t step; cudaStream_t stream; cudaEvent_t events[NCCL_STEPS]; }; static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large"); -struct p2pSendResources { - struct ncclSendMem* devMem; - uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only) - void* sendMemIpc; - void* recvMemIpc; - struct p2pProxyInfo proxyInfo; -}; - -struct p2pRecvResources { 
- struct ncclRecvMem* devMem; +struct p2pResources { + enum p2pType type; + union { + struct ncclSendMem* sendDevMem; + struct ncclRecvMem* recvDevMem; + }; void* sendMemIpc; void* recvMemIpc; + // CE memcpy support + struct p2pShmProxyInfo proxyInfo; struct p2pShm* shm; struct p2pShm* devShm; int shmSize; ncclShmHandle_t handle; + uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only) +}; + +// cuMem API support +struct p2pCuMemProxyInfo { + struct ncclP2pBuff p2pBuff; }; #include @@ -90,6 +98,7 @@ static int busIdToCudaDev(int64_t busId) { return -1; } +// CE memcpy support NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0); static int useMemcpy = 0; static void initCeOperation(); @@ -149,14 +158,11 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop *ret = 0; return ncclSuccess; } - if (p2p == 0 && cudaDev1 == cudaDev2 && info1->busId == info2->busId) { - p2p = 1; - } #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else - // Check that legacy IPC support is available - if (p2p != 0) { + // This will always fail when using NCCL_CUMEM_ENABLE=1 + if (p2p != 0 && !ncclCuMemEnable()) { // Cached result of the legacyIPC detection static int legacyIPC = -1; if (legacyIPC >= 0) { @@ -166,12 +172,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop // Check that legacy IPC support is available (WSL WAR) char *dummy; cudaIpcMemHandle_t ipc; - NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN)); + NCCLCHECK(ncclCudaMalloc(&dummy, CUDA_IPC_MIN)); if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) { INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported"); *ret = 0; } - CUDACHECK(cudaFree(dummy)); + NCCLCHECK(ncclCudaFree(dummy)); legacyIPC = *ret; return ncclSuccess; } @@ -193,6 +199,98 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \ } 
while (0) +// cuMem API support +ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) { + if (ncclCuMemEnable()) { +#if CUDART_VERSION >= 11030 + // cuMem API support + CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; + CUmemGenericAllocationHandle handle; + + NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); + CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0)); +#else + return ncclInternalError; +#endif + } else { + // Allocate a CUDA buffer and generate an IPC handle for it + NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, true)); + cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr); + if (res != cudaSuccess) { + WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); + ncclCudaFree(*ptr); + CUDACHECK(res); + } + } + INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zi ipcDesc %p", *ptr, size, ipcDesc); + + return ncclSuccess; +} + +ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { + if (ncclCuMemEnable()) { + // cuMem API support + CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; + + if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + int fd = *(int *) &ipcDesc->cuDesc.data; + if (fd <= 0) return ncclInternalError; + (void) close(fd); + } + } + + return ncclSuccess; +} + +ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) { + if (ncclCuMemEnable()) { +#if CUDART_VERSION >= 11030 + // cuMem API support + CUdeviceptr dptr = 0; + CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; + CUmemGenericAllocationHandle handle; + ncclCuDesc *cuDesc = &ipcDesc->cuDesc; + + // Import and map the remote memory descriptor to the local GPU + if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + // UDS fd support + struct ncclProxyConnector proxyConn; + int fd = *(int *)(&cuDesc->data); + int newFd = -1; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, 
&proxyConn)); + NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, &newFd)); + INFO(NCCL_P2P, "UDS converted fd %d -> %d on peer %d", fd, newFd, tpPeer); + CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)newFd, type)); + close(newFd); + } else { + CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type)); + } + CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0)); + CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0)); + + TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%lx dptr %p", size, (long)handle, (void*)dptr); + + // Allow access by the local GPU + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1)); + TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id); + + *devMemPtr = (void *)dptr; +#else + return ncclInternalError; +#endif + } else { + // Legacy CUDA IPC + CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess)); + } + + INFO(NCCL_P2P, "Imported shareable buffer device %d size %zi ptr %p", comm->cudaDev, size, *devMemPtr); + + return ncclSuccess; +} // Setting this to non zero causes P2P to use Reads rather than Writes NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2); @@ -209,10 +307,11 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* return ncclSuccess; } -static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { - if (myInfo->pidHash == peerInfo->pidHash) { +static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { + if (!ncclCuMemEnable() && 
myInfo->pidHash == peerInfo->pidHash) { if (peerInfo->cudaDev != myInfo->cudaDev) { - // Enable P2P access + // Same PID different GPUs, enable P2P access + // Legacy CUDA IPC cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); @@ -225,8 +324,15 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee *devMem = p2pBuff->directPtr; *ipcPtr = NULL; } else { - CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pBuff->devIpc, cudaIpcMemLazyEnablePeerAccess)); - *ipcPtr = *devMem; + if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) { + // Same PID and GPU + *devMem = p2pBuff->directPtr; + *ipcPtr = NULL; + } else { + // Different PID or different GPU + NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem)); + *ipcPtr = *devMem; + } } return ncclSuccess; } @@ -234,7 +340,8 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee /* Send: Create and return connect structures for this peer to connect to me */ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - struct p2pSendResources* resources; + struct p2pResources* resources; + int tpProxyRank; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; int useRead, intermediateRank; @@ -261,35 +368,47 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st int sendSize = sizeof(struct ncclSendMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE]; + if (info->read) sendSize += comm->buffSizes[NCCL_PROTO_SIMPLE]; ALIGN_SIZE(sendSize, CUDA_IPC_MIN); if 
(intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { - if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) { + resources->type = P2P_DIRECT; + send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks); } else { + // cuMem API support + if (ncclCuMemEnable()) { + resources->type = P2P_CUMEM; + INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s comm %p nRanks %02d", + channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);; + } else { + // Legacy CUDA IPC + resources->type = P2P_IPC; + INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d", + channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks); + } send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d", - channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? 
"/CE" : "", comm, comm->nRanks); } } else { + resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, - comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks); + comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks); } - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); + tpProxyRank = comm->topParentRanks[info->rank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn)); if (useMemcpy) { - NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo))); + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo))); info->shmSize = resources->proxyInfo.shmSize; memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); } else { - NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc)); } return ncclSuccess; @@ -298,7 +417,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st /* Create and return connect structures for this peer to connect to me */ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* 
peerInfo, struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { - struct p2pRecvResources* resources; + struct p2pResources* resources; + int tpProxyRank; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; int useRead, intermediateRank; @@ -312,44 +432,56 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st int recvSize = sizeof(struct ncclRecvMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - for (int p=0; pread && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p]; + for (int p=0; pread && p == NCCL_PROTO_SIMPLE)) recvSize += comm->buffSizes[p]; ALIGN_SIZE(recvSize, CUDA_IPC_MIN); if (intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) { - if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) { + resources->type = P2P_DIRECT; + recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { + if (ncclCuMemEnable()) { + // cuMem API support + resources->type = P2P_CUMEM; + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/CUMEM", + channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + } else { + // Legacy CUDA IPC + resources->type = P2P_IPC; + } recv->conn.flags |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; } } else { + resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; } - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn)); - NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); + tpProxyRank = comm->topParentRanks[info->rank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn)); + NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc)); + NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc)); return ncclSuccess; } /* Connect/Send to this peer */ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { - struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; - struct ncclRecvMem* remDevMem; + struct p2pResources* resources = (struct p2pResources*)send->transportResources; + struct ncclRecvMem* remDevMem = NULL; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); + NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); char* buff = (char*)(remDevMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */ - if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy - send->conn.buffs[p] = (char*)(resources->devMem+1); + if (resources->sendDevMem == NULL) return 
ncclInternalError; // We should not use read + memcpy + send->conn.buffs[p] = (char*)(resources->sendDevMem+1); } else { send->conn.buffs[p] = buff; - buff += send->comm->buffSizes[p]; + buff += comm->buffSizes[p]; } } @@ -358,20 +490,20 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo; send->conn.head = &resources->proxyInfo.devShm->sendMem.head; // Send SIMPLE buff to proxy, and replace it by local buffer - NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff; } else { send->conn.tail = &remDevMem->tail; - send->conn.head = &resources->devMem->head; - send->conn.ptrExchange = &resources->devMem->ptrExchange; - send->conn.redOpArgExchange = resources->devMem->redOpArgExchange; + send->conn.head = &resources->sendDevMem->head; + send->conn.ptrExchange = &resources->sendDevMem->ptrExchange; + send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange; } return ncclSuccess; } /* Connect/Recv from this peer */ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { - struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; + struct p2pResources* resources = (struct p2pResources*)recv->transportResources; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; struct ncclSendMem* remDevMem = NULL; @@ -381,20 +513,22 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); resources->shmSize = 
info->shmSize; + // Attach to peer's SHM segment NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle)); recv->conn.tail = &resources->devShm->recvMem.tail; recv->conn.head = &resources->devShm->sendMem.head; } else { - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); + NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); - recv->conn.tail = &resources->devMem->tail; + struct ncclRecvMem* devMem = resources->recvDevMem; + recv->conn.tail = &devMem->tail; recv->conn.head = &remDevMem->head; recv->conn.ptrExchange = &remDevMem->ptrExchange; recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; } - char* buff = (char*)(resources->devMem+1); + char* buff = (char*)(resources->recvDevMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy @@ -402,93 +536,113 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn recv->conn.buffs[p] = (char*)(remDevMem+1); } else { recv->conn.buffs[p] = buff; - buff += recv->comm->buffSizes[p]; + buff += comm->buffSizes[p]; } } return ncclSuccess; } ncclResult_t p2pSendFree(struct ncclConnector* send) { - struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; + struct p2pResources* resources = (struct p2pResources*)send->transportResources; if (resources) { - if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); - if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + if (ncclCuMemEnable()) { + // cuMem API support + if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); + if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); + } + else { + if 
(resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + } free(resources); } return ncclSuccess; } ncclResult_t p2pRecvFree(struct ncclConnector* recv) { - struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; + struct p2pResources* resources = (struct p2pResources*)recv->transportResources; if (resources) { - if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); - if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); - if (useMemcpy) { - NCCLCHECK(ncclShmClose(resources->handle)); + if (ncclCuMemEnable()) { + // cuMem API support + if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); + if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); + } + else { + if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + if (useMemcpy) { + NCCLCHECK(ncclShmClose(resources->handle)); + } } free(resources); } return ncclSuccess; } -static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (useMemcpy) { - struct p2pProxyInfo* proxyInfo; + // CE memcpy support + struct p2pShmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); connection->transportResources = proxyInfo; - NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream, true)); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, true)); char shmPath[PATH_MAX]; 
shmPath[0] = '\0'; proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); + // Create a SHM segment for the peer to attach to NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize); memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); - if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError; - memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo)); + if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError; + memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); } else { if (reqSize != sizeof(int)) return ncclInternalError; int size = *((int*)reqBuff); if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; - NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, comm->sideStream, true)); - connection->transportResources = p2pBuff->directPtr; - cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr); - if (res != cudaSuccess) { - WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); - cudaFree(p2pBuff->directPtr); - free(p2pBuff); - CUDACHECK(res); + NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); + p2pBuff->size = size; + if (ncclCuMemEnable()) { + // cuMem API support + struct p2pCuMemProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff)); + connection->transportResources = proxyInfo; + } else { + connection->transportResources = p2pBuff->directPtr; } } *done = 1; return ncclSuccess; } -static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* 
done) { +static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(int)) return ncclInternalError; int size = *((int*)reqBuff); if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; - NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, comm->sideStream, true)); - connection->transportResources = p2pBuff->directPtr; - cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr); - if (res != cudaSuccess) { - WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); - cudaFree(p2pBuff->directPtr); - free(p2pBuff); - CUDACHECK(res); + NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); + p2pBuff->size = size; + if (ncclCuMemEnable()) { + // cuMem API support + struct p2pCuMemProxyInfo* proxyInfo; + NCCLCHECK(ncclCalloc(&proxyInfo, 1)); + memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff)); + connection->transportResources = proxyInfo; + } else { + connection->transportResources = p2pBuff->directPtr; } *done = 1; return ncclSuccess; } -static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { - struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; +static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources; if (reqSize != sizeof(void*)) return ncclInternalError; proxyInfo->recvFifo = *((char**)reqBuff); @@ -501,13 +655,14 @@ static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, return 
ncclSuccess; } -static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { + // CE memcpy support if (useMemcpy) { - struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources; + struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources; if (proxyInfo) { NCCLCHECK(ncclShmClose(proxyInfo->handle)); NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); - CUDACHECK(cudaFree(proxyInfo->ceDevBuff)); + NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff)); CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); for (int i=0; ievents[i])); @@ -515,23 +670,45 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str free(proxyInfo); } } else { - // Do not check return code as CUDA may have already shut down - cudaFree(connection->transportResources); + if (ncclCuMemEnable()) { + // cuMem API support + struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources; + if (proxyInfo) { + struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff; + ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc); + ncclCudaFree(p2pBuff->directPtr); + free(proxyInfo); + } + } else { + // Do not check return code as CUDA may have already shut down + ncclCudaFree(connection->transportResources); + } } return ncclSuccess; } -static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { - // Do not check return code as CUDA may have already shut down - cudaFree(connection->transportResources); +static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { + if (ncclCuMemEnable()) { + struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources; + if (proxyInfo) { + struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff; + 
ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc); + ncclCudaFree(p2pBuff->directPtr); + free(proxyInfo); + } + } else { + // Do not check return code as CUDA may have already shut down + ncclCudaFree(connection->transportResources); + } return ncclSuccess; } -static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { +// CE memcpy support +static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources); + struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; @@ -541,10 +718,10 @@ static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxy args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - int stepSize = comm->buffSizes[p] / NCCL_STEPS; + int stepSize = proxyState->buffSizes[p] / NCCL_STEPS; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources); + struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources); if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy resources->step = sub->base + sub->nsteps; args->done++; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index e125df2c2f..dbabf66725 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -85,7 +85,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr shmPath[0] = '\0'; int shmSize = sizeof(struct ncclSendMem); if (shmLocality == SHM_SEND_SIDE) { - for (int p=0; pcomm->buffSizes[p]; 
+ for (int p=0; pbuffSizes[p]; } info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle)); @@ -108,7 +108,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr shmPath[0] = '\0'; int shmSize = sizeof(struct ncclRecvMem); if (shmLocality == SHM_RECV_SIDE) { - for (int p=0; pcomm->buffSizes[p]; + for (int p=0; pbuffSizes[p]; } info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle)); @@ -146,7 +146,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; pconn.buffs[p] = buff; - buff += send->comm->buffSizes[p]; + buff += comm->buffSizes[p]; } send->conn.tail = &resources->devRemHostMem->tail; send->conn.head = &resources->devHostMem->head; @@ -155,9 +155,11 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co send->conn.sizesFifo = resources->devRemHostMem->sizesFifo; } if (useMemcpySend) { - NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn)); + int tpProxyRank; + tpProxyRank = comm->topParentRanks[comm->rank]; + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn)); struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; - NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); + NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); 
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; send->conn.tail = &proxyInfo.ceRecvMem->tail; send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo; @@ -179,7 +181,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; pconn.buffs[p] = buff; - buff += recv->comm->buffSizes[p]; + buff += comm->buffSizes[p]; } recv->conn.head = &resources->devRemHostMem->head; recv->conn.tail = &resources->devHostMem->tail; @@ -187,7 +189,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co if (useMemcpyRecv) { NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn)); struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; - NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); + NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; recv->conn.tail = &proxyInfo.ceRecvMem->tail; } @@ -214,12 +216,12 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) { return ncclSuccess; } -static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { +static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; memcpy(proxyInfo, reqBuff, reqSize); - 
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream)); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); for (int i=0; idevFifo, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream)); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); for (int i=0; itransportResources; if (resources) { CUDACHECK(cudaStreamDestroy(resources->stream)); - CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaFree(resources->devFifo)); NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); for (int i=0; ievents[i])); @@ -265,12 +267,12 @@ static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, str return ncclSuccess; } -static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { +static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; if (resources) { CUDACHECK(cudaStreamDestroy(resources->stream)); - CUDACHECK(cudaFree(resources->devFifo)); + NCCLCHECK(ncclCudaFree(resources->devFifo)); NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); for (int i=0; ievents[i])); @@ -280,7 +282,7 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str return ncclSuccess; } -static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { +static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct 
ncclProxySubArgs* sub = args->subs+s; @@ -294,7 +296,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - int stepSize = comm->buffSizes[p] / NCCL_STEPS; + int stepSize = proxyState->buffSizes[p] / NCCL_STEPS; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); @@ -339,7 +341,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy return ncclSuccess; } -static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { +static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -353,7 +355,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxy args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - int stepSize = comm->buffSizes[p] / NCCL_STEPS; + int stepSize = proxyState->buffSizes[p] / NCCL_STEPS; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp index a45f8bdedb..54356d0f0f 100644 --- a/test/common/TestBedChild.cpp +++ b/test/common/TestBedChild.cpp @@ -196,7 +196,7 @@ namespace RcclUnitTesting if (useMultiRankPerGpu) { - if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess) + //if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess) { ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId); status = TEST_FAIL; diff 
--git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile index 4a51a427ce..7446d7b52a 100644 --- a/tools/topo_expl/Makefile +++ b/tools/topo_expl/Makefile @@ -6,7 +6,7 @@ endif HIPCC = $(HIP_PATH)/bin/hipcc EXE = topo_expl -CXXFLAGS = -g -O3 -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL +CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \ hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc diff --git a/tools/topo_expl/include/model.h b/tools/topo_expl/include/model.h index 13e66fbaf0..3b02c55ea2 100644 --- a/tools/topo_expl/include/model.h +++ b/tools/topo_expl/include/model.h @@ -69,7 +69,7 @@ public: int rankToCudaDev(int rank) { for (int i=0; inodes[GPU].nodes[i].gpu.rank[0]) + if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank) return systems[0]->nodes[GPU].nodes[i].gpu.dev; } return -1; @@ -77,7 +77,7 @@ public: int64_t getGpuBusId(int rank) { for (int i=0; inodes[GPU].nodes[i].gpu.rank[0]) + if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank) return systems[0]->nodes[GPU].nodes[i].id; } return -1; @@ -93,7 +93,7 @@ public: void setRanks() { for (int r=0; rnodes[GPU].nodes[i].gpu.rank[0] += firstRank; + systems[r]->nodes[GPU].nodes[i].gpu.rank += firstRank; } int p2pCanConnect(int device1, int device2) { return 1; } @@ -133,4 +133,4 @@ public: NetworkModel() : nRanks(0) {} }; -#endif +#endif \ No newline at end of file diff --git a/tools/topo_expl/include/nccl.h b/tools/topo_expl/include/nccl.h index aab3a4487f..27737e2231 100644 --- a/tools/topo_expl/include/nccl.h +++ b/tools/topo_expl/include/nccl.h @@ -1,6 +1,7 @@ 
/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,24 +13,25 @@ #include #define NCCL_MAJOR 2 -#define NCCL_MINOR 14 -#define NCCL_PATCH 3 +#define NCCL_MINOR 18 +#define NCCL_PATCH 1 #define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE 21403 +#define NCCL_VERSION_CODE 21801 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #define RCCL_BFLOAT16 1 #define RCCL_GATHER_SCATTER 1 #define RCCL_ALLTOALLV 1 -#define RCCL_MULTIRANKPERGPU 1 #ifdef __cplusplus extern "C" { #endif /*! @brief Opaque handle to communicator */ +#include typedef struct ncclComm* ncclComm_t; +#define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; @@ -45,15 +47,24 @@ typedef enum { ncclSuccess = 0, ncclInProgress = 7, ncclNumResults = 8 } ncclResult_t; +#define NCCL_CONFIG_UNDEF_INT INT_MIN +#define NCCL_CONFIG_UNDEF_PTR NULL +#define NCCL_SPLIT_NOCOLOR -1 + /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ -typedef struct ncclConfig_v21400 { +typedef struct ncclConfig_v21700 { /* attributes that users should never touch. */ size_t size; unsigned int magic; unsigned int version; /* attributes that users are able to customize. 
*/ int blocking; + int cgaClusterSize; + int minCTAs; + int maxCTAs; + const char *netName; + int splitShare; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -62,7 +73,12 @@ typedef struct ncclConfig_v21400 { sizeof(ncclConfig_t), /* size */ \ 0xcafebeef, /* magic */ \ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ - 1 /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. @@ -117,28 +133,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); /// @endcond -/*! @brief Creates a new communicator (multi thread/process version) allowing multiple ranks per device. - - @details - rank must be between 0 and nranks-1 and unique within a communicator clique. - Each rank is associated to a HIP device, which has to be set before calling - ncclCommInitRankMulti. - Since this version of the function allows multiple ranks to utilize the same - HIP device, a unique virtualId per device has to be provided by each calling - rank. - ncclCommInitRankMulti implicitly syncronizes with other ranks, so it must be - called by different threads/processes or use ncclGroupStart/ncclGroupEnd. - - @param[in] - comm ncclComm_t* - communicator struct pointer - */ - ncclResult_t ncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId); -/// @cond include_hidden - ncclResult_t pncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId); -/// @endcond - - /*! 
@brief Creates a clique of communicators (single process version). * * @details This is a convenience function to create a single-process communicator clique. @@ -177,6 +171,19 @@ ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); /// @endcond +/*! @brief Creates one or more communicators from an existing one. + * Ranks with the same color will end up in the same communicator. + * Within the new communicator, key will be used to order ranks. + * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group + * and will therefore return a NULL communicator. + * If config is NULL, the new communicator will inherit the original communicator's + * configuration*/ +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +/// @cond include_hidden +ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +/// @endcond + +/* Returns a string for each error code. */ /*! @brief Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); /// @cond include_hidden @@ -188,7 +195,7 @@ const char* pncclGetErrorString(ncclResult_t result); */ const char* ncclGetLastError(ncclComm_t comm); /// @cond include_hidden -const char* pncclGetError(ncclComm_t comm); +const char* pncclGetLastError(ncclComm_t comm); /// @endcond /* Checks whether the comm has encountered any asynchronous errors */ @@ -498,6 +505,44 @@ ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); /// @endcond +/*! @brief Opaque handle to MSCCL algorithm */ +typedef int mscclAlgoHandle_t; + +/*! @brief MSCCL Load Algorithm + * + * @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return + * its handle via mscclAlgoHandle. 
This API is expected to be called by MSCCL + * scheduler instead of end users. + */ +ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); +ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); + +/*! @brief MSCCL Run Algorithm + * + * @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter + * list merges all possible parameters required by different operations as this + * is a general-purposed API. This API is expected to be called by MSCCL + * scheduler instead of end users. + */ +ncclResult_t mscclRunAlgo( + const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], + void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], + size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream); +ncclResult_t pmscclRunAlgo( + const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], + void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], + size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream); + +/*! @brief MSCCL Load Algorithm + * + * @details Unload MSCCL algorithm previous loaded using its handle. This API + * is expected to be called by MSCCL scheduler instead of end users. 
+ */ +ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); +ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); + /* * Group semantics * diff --git a/tools/topo_expl/include/utils.h b/tools/topo_expl/include/utils.h index 8ce2172727..2554380d80 100644 --- a/tools/topo_expl/include/utils.h +++ b/tools/topo_expl/include/utils.h @@ -8,8 +8,7 @@ #ifndef UTILS_H_ #define UTILS_H_ -// AllGather3 - begin -struct ncclGraphInfo { +struct graphInfo { int pattern; int nChannels; int sameChannels; @@ -19,14 +18,10 @@ struct ncclGraphInfo { int typeInter; }; -struct allGather3Data_t{ - int netDev; - int collNetSupport; - int nc; - struct ncclGraphInfo tree; - struct ncclGraphInfo ring; - struct ncclGraphInfo collNet; +struct allGatherInfo { + struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; struct ncclTopoRanks topoRanks; + int nc; bool pivotA2AEnabled; bool ll128Enabled; bool mscclEnabled; @@ -40,11 +35,11 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash); -ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, - struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph); +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph, struct ncclComm* parent = NULL); -ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, - struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph); +ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct 
ncclTopoGraph& nvlsGraph); #define TIME_START(index) diff --git a/tools/topo_expl/model.cpp b/tools/topo_expl/model.cpp index b677de49b3..f1d063ce63 100644 --- a/tools/topo_expl/model.cpp +++ b/tools/topo_expl/model.cpp @@ -216,10 +216,10 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); } *((int*)connectInfo) = proxyRank; @@ -242,7 +242,7 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); return ncclSuccess; } diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp index bbb5eaff56..ea636a5492 100644 --- a/tools/topo_expl/topo_expl.cpp +++ b/tools/topo_expl/topo_expl.cpp @@ -153,10 +153,15 @@ NodeModelDesc model_descs[] = { {2, "topo_8p1h_5.xml", "2 nodes 8P1H Alt."}, }; +NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS); +NCCL_PARAM(MinCTAs, "MIN_CTAS", 1); + int main(int argc,char* argv[]) { struct ncclComm *comm; const int num_models = sizeof(model_descs) / sizeof(*model_descs); + int minCTAsEnv; + int maxCTAsEnv; if (!cmdOptionExists(argv, argv + argc, "-m")) { printf("Usage: ./topo_expl -m model_id\n"); @@ -200,18 +205,22 @@ int main(int argc,char* argv[]) node_model->rankToCudaDev(i), node_model->getGpuBusId(i)); } + minCTAsEnv = ncclParamMinCTAs(); + maxCTAsEnv = ncclParamMaxCTAs(); + NCCLCHECK(ncclCalloc(&comm, nranks)); struct ncclPeerInfo *peerInfo; NCCLCHECK(ncclCalloc(&peerInfo, nranks+1)); // Extra rank to represent CollNet root - struct allGather3Data_t *allGather3Data; + struct allGatherInfo* allGather3Data; NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); - struct ncclTopoGraph *treeGraph, 
*ringGraph, *collNetGraph; + struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph, *nvlsGraph; NCCLCHECK(ncclCalloc(&treeGraph, nranks)); NCCLCHECK(ncclCalloc(&ringGraph, nranks)); NCCLCHECK(ncclCalloc(&collNetGraph, nranks)); + NCCLCHECK(ncclCalloc(&nvlsGraph, nranks)); for (int i = 0; i < nranks; i++) { comm[i].rank = i; @@ -224,8 +233,23 @@ int main(int argc,char* argv[]) comm[i].topo = node_model->getSystem(i); comm[i].peerInfo = peerInfo; comm[i].ncclNet = ncclNet; - comm[i].virtualId = -1; - // Mark channels as non initialized. + comm[i].config.maxCTAs = maxCTAsEnv; + comm[i].config.minCTAs = minCTAsEnv; + if (comm[i].topParentRanks == NULL) { + NCCLCHECK(ncclCalloc(&comm[i].topParentRanks, comm->nRanks)); + for (int j = 0; j < comm->nRanks; ++j) + comm[i].topParentRanks[j] = j; + } + struct ncclSharedResources* sharedRes = NULL; + NCCLCHECK(ncclCalloc(&sharedRes, 1)); + /* most of attributes are assigned later in initTransportsRank(). */ + sharedRes->owner = &comm[i]; + sharedRes->tpNRanks = comm[i].nRanks; + NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm[i].nRanks)); + comm[i].sharedRes = sharedRes; + sharedRes->refCount = 1; + ncclMemoryStackConstruct(&comm[i].memPermanent); + // Mark channels as non initialized. 
for (int c=0; c #include #include +#include #include "xml.h" #include "coll_net.h" #include "model.h" @@ -37,8 +38,15 @@ const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; extern NodeModel *node_model; -NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); +RCCL_PARAM(CliqueIgnoreTopo, "CLIQUE_IGNORE_TOPO", 0); +RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0); +RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 1); +RCCL_PARAM(LL128ForceEnable, "LL128_FORCE_ENABLE", 0); + NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); +NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); +NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0); +NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); thread_local int ncclDebugNoWarn = 0; ncclCollNet_t* ncclCollNet = NULL; @@ -74,6 +82,103 @@ ncclResult_t busIdToInt64(const char* busId, int64_t* id) { return ncclSuccess; } +void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) { + // `me->hunks` points to the top of the stack non-empty hunks. Hunks above + // this (reachable via `->above`) are empty. + struct Hunk* top = me->topFrame.hunk; + size_t mallocSize = 0; + + // If we have lots of space left in hunk but that wasn't enough then we'll + // allocate the object unhunked. + if (me->topFrame.end - me->topFrame.bumper >= 8<<10) + goto unhunked; + + // If we have another hunk (which must be empty) waiting above this one and + // the object fits then use that. 
+ if (top && top->above) { + struct Hunk* top1 = top->above; + uintptr_t uobj = (reinterpret_cast(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align); + if (uobj + size <= reinterpret_cast(top1) + top1->size) { + me->topFrame.hunk = top1; + me->topFrame.bumper = uobj + size; + me->topFrame.end = reinterpret_cast(top1) + top1->size; + return reinterpret_cast(uobj); + } + } + + { // If the next hunk we're going to allocate wouldn't be big enough but the + // Unhunk proxy fits in the current hunk then go allocate as unhunked. + size_t nextSize = (top ? top->size : 0) + (64<<10); + constexpr size_t maxAlign = 64; + if (nextSize < sizeof(struct Hunk) + maxAlign + size) { + uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); + if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end) + goto unhunked; + } + + // At this point we must need another hunk, either to fit the object + // itself or its Unhunk proxy. + mallocSize = nextSize; + INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); + struct Hunk *top1 = (struct Hunk*)malloc(mallocSize); + if (top1 == nullptr) goto malloc_exhausted; + top1->size = nextSize; + top1->above = nullptr; + if (top) top->above = top1; + top = top1; + me->topFrame.hunk = top; + me->topFrame.end = reinterpret_cast(top) + nextSize; + me->topFrame.bumper = reinterpret_cast(top) + sizeof(struct Hunk); + } + + { // Try to fit object in the new top hunk. + uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align); + if (uobj + size <= me->topFrame.end) { + me->topFrame.bumper = uobj + size; + return reinterpret_cast(uobj); + } + } + +unhunked: + { // We need to allocate the object out-of-band and put an Unhunk proxy in-band + // to keep track of it. 
+ uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); + Unhunk* proxy = reinterpret_cast(uproxy); + me->topFrame.bumper = uproxy + sizeof(Unhunk); + proxy->next = me->topFrame.unhunks; + me->topFrame.unhunks = proxy; + mallocSize = size; + proxy->obj = malloc(mallocSize); + INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); + if (proxy->obj == nullptr) goto malloc_exhausted; + return proxy->obj; + } + +malloc_exhausted: + WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize); + abort(); +} + +void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { + // Free unhunks first because both the frames and unhunk proxies lie within the hunks. + struct ncclMemoryStack::Frame* f = &me->topFrame; + while (f != nullptr) { + struct ncclMemoryStack::Unhunk* u = f->unhunks; + while (u != nullptr) { + free(u->obj); + u = u->next; + } + f = f->below; + } + // Free hunks + struct ncclMemoryStack::Hunk* h = me->stub.above; + while (h != nullptr) { + struct ncclMemoryStack::Hunk *h1 = h->above; + free(h); + h = h1; + } +} + int ncclDebugLevel = -1; void ncclDebugInit() { @@ -126,43 +231,60 @@ ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** return ncclSuccess; } +NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); void initCollNet() { if (ncclParamCollNetEnable() == 1 && ncclCollNet == 0) ncclCollNet = (ncclCollNet_t*)0x12345678; } -ncclResult_t initChannel(struct ncclComm* comm, int channelid) { - struct ncclChannel* channel = comm->channels+channelid; +ncclResult_t initChannel(struct ncclComm* comm, int channelId) { + struct ncclChannel* channel = &comm->channels[channelId]; if (channel->id != -1) return ncclSuccess; - channel->id = channelid; - // Ring index to user rank table. 
- //NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); - NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); + int nRanks = comm->nRanks; + int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */; + channel->id = channelId; + channel->workFifoSent = 0; - // Communication structures with peers. - //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) - NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); - for (size_t i=0; inRanks+1; ++i) { - for (int b=0; bpeers[i].send[b].comm = comm; - channel->peers[i].recv[b].comm = comm; + struct ncclSharedResources* sharedRes = comm->sharedRes; + + //NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + + if (channel->peers == NULL) { + // The extra on nRanks+1 is for collnet root (i.e. network) + // Allocate everything related to sharedRes with ncclCalloc as this can be + // shared between communicators hence should not be tied to comm. 
+ if (sharedRes->peers[channelId] == NULL) { + NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks)); + } + channel->peers = ncclMemoryStackAlloc(&comm->memPermanent, nPeers); + for (int r = 0; r < nRanks; r++) { + channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r]; + ncclAtomicRefCountIncrement(&channel->peers[r]->refCount); } } +#if 0 + if (channel->devPeers == NULL) { + if (sharedRes->devPeers[channelId] == NULL) { + NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); + } + /* channel->devPeers is not shared, so just free it when calling commFree() */ + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); + ncclCommPushCudaFree(comm, channel->devPeers); + for (int r = 0; r < nRanks; r++) { + uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + } + } +#endif + channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + //NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); + //ncclCommPushCudaFree(comm, channel->devRingUserRanks); - // Per-channel operation list. 
- //NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); - //if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { - // GDRCOPY support - // We allocate a workFifo in GDR mapped CUDA memory - // But we still allocate the Host workFifo so that we - // can copy the work elements to CUDA memory on kernel launch - //NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc)); - //} else { - // The device workFifo is the Host one - //channel->workFifoDev = channel->workFifo; - //} + //NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); + //CUDACHECK(hipEventRecord(sharedRes->deviceStream.scratchEvent, sharedRes->deviceStream.cudaStream)); + //CUDACHECK(hipStreamWaitEvent(sharedRes->deviceStream.cudaStream, sharedRes->deviceStream.scratchEvent, 0)); return ncclSuccess; } @@ -213,8 +335,8 @@ template static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) { struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; - struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : - comm->channels[channelId].peers[peer].recv + connIndex; + struct ncclConnector* connector = (type == 1) ? 
comm->channels[channelId].peers[peer]->send + connIndex : + comm->channels[channelId].peers[peer]->recv + connIndex; // handle intra-node network connections int n1 = -1, n2 = -1; if (connIndex == NCCL_CONN_IDX_P2P_NET) { @@ -248,12 +370,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int n uint64_t mask = 1UL << channel->id; for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue; comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask; } for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue; comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? 
NCCL_CONN_IDX_P2P_NET : 0)] |= mask; } return ncclSuccess; @@ -272,9 +394,12 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* // Stream used during transport setup; need for P2P pre-connect + CUDA Graph ncclResult_t ret = ncclSuccess; int highestType = TRANSPORT_P2P; // track highest transport type - struct ncclConnect data[2*MAXCHANNELS]; + struct ncclConnect** data = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Store intermediate send/recvData structs for connect + struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel + struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel - //NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail); + //NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + // First time initialization for (int i=1; inRanks; i++) { int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; @@ -282,22 +407,28 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)]; uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? 
NCCL_CONN_IDX_P2P_NET : 0)]; - struct ncclConnect* recvData = data; + // Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer + // This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers + // The first N entries contain recvData, connection information for recv connections + // The next M entries contain sendData, connection information for send connections + // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections + data[i] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS); + recvData[i] = data[i]; int sendChannels = 0, recvChannels = 0; int type; TIME_START(0); for (int c=0; c(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); + NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[i]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); if (type > highestType) highestType = type; } } TIME_STOP(0); TIME_START(1); - struct ncclConnect* sendData = recvData+recvChannels; + sendData[i] = recvData[i]+recvChannels; for (int c=0; c(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); + NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[i]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); if (type > highestType) highestType = type; } } @@ -306,48 +437,93 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* TIME_START(2); if (sendPeer == recvPeer) { if (recvChannels+sendChannels) { - //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); - //NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); - sendData = data; - recvData = data+sendChannels; + //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, 
bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); + //NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); + sendData[i] = data[i]; + recvData[i] = data[i]+sendChannels; } } else { - //if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail); - //if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail); - //if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail); - //if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail); + //if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail); + //if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail); + //if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail); + //if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail); } TIME_STOP(2); - - TIME_START(3); - for (int c=0; cchannels[c].peers[sendPeer].send + connIndex; - //NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail); - conn->connected = 1; - //CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail); - 
//CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail); - } - } - TIME_STOP(3); - TIME_START(4); - for (int c=0; cchannels[c].peers[recvPeer].recv + connIndex; - //NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail); - conn->connected = 1; - //CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail); - } - } - TIME_STOP(4); - comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL; } + // Loop until all channels with all ranks have been connected + bool allChannelsConnected; + allChannelsConnected = false; + while (!allChannelsConnected) { + allChannelsConnected = true; + for (int i=1; inRanks; i++) { + int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; + int sendPeer = (comm->rank + i) % comm->nRanks; + uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)]; + uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)]; + + int sendDataOffset = 0; + int recvDataOffset = 0; + for (int c=0; cchannels[c].peers[sendPeer]->send + connIndex; + // This connector hasn't completed connection yet + if (conn->connected == 0) { + //NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail); + if (ret == ncclSuccess) { + struct ncclDevChannelPeer* addr; + conn->connected = 1; + /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. 
*/ + //CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); + //CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + } else if (ret == ncclInProgress) { + allChannelsConnected = false; + } + } + } + TIME_STOP(3); + + // Start with recv channels + TIME_START(4); + if (recvMask & (1UL<channels[c].peers[recvPeer]->recv + connIndex; + // This connector hasn't completed connection yet + if (conn->connected == 0) { + //NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail); + if (ret == ncclSuccess) { + struct ncclDevChannelPeer* addr; + conn->connected = 1; + /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */ + //CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); + //CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + } else if (ret == ncclInProgress) { + allChannelsConnected = false; + } + } + } + TIME_STOP(4); + } + } + } + + // Clear all connect masks and free each connectInfo array + for (int i=1; inRanks; i++) { + int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; + int sendPeer = (comm->rank + i) % comm->nRanks; + comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? 
NCCL_CONN_IDX_P2P_NET : 0)] = 0UL; + free(data[i]); + } + + free(data); + free(sendData); + free(recvData); + if (highestTransportType != NULL) *highestTransportType = highestType; TIME_PRINT("P2P Setup/Connect"); exit: - //NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream)); - //NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream)); + //NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); + //NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); return ret; fail: goto exit; @@ -381,7 +557,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN } // select - struct ncclChannelPeer* root = channel->peers+nranks; + struct ncclChannelPeer* root = channel->peers[nranks]; // connector index: 0 for recv, 1 for send struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send); @@ -419,10 +595,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN } // connect if (isMaster) { - //NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); - struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks; - struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? 
devRoot->recv+type : devRoot->send+type; - //CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup); + NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); + struct ncclDevChannelPeer* devRoot; + //CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup); + struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type; + //CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup); } // recv side sends connect info to send side if (isMaster && type == collNetRecv) { @@ -460,157 +637,60 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { // Free collNet resources for (int r=0; rnChannels; r++) { struct ncclChannel* channel = comm->channels+r; - struct ncclChannelPeer* peer = channel->peers+comm->nRanks; - for (int b=0; bsend + b; - //if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); - send->transportResources = NULL; // avoid double free - } - for (int b=0; brecv + b; - //if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv)); - recv->transportResources = NULL; // avoid double free + struct ncclChannelPeer* peer = channel->peers[comm->nRanks]; + if (peer) { + if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) { + for (int b=0; bsend + b; + if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); + send->transportResources = NULL; // avoid double free + } + for (int b=0; brecv + b; + if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv)); + recv->transportResources = NULL; // avoid double free + } + } } } return ncclSuccess; } -RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0); 
-NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); -RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0); -NCCL_PARAM(AllocP2pNetLLBuffers, "NCCL_ALLOC_P2P_NET_LL_BUFFERS", 0); -RCCL_PARAM(LL128ForceEnable, "LL128_FORCE_ENABLE", 0); - -static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collNetGraph) { - ncclResult_t ret = ncclSuccess; - int* heads = NULL; - int rank = comm->rank; - int collNetSetupFail = 0; - int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P }; - // Find all head ranks - int nHeads = collNetGraph->nChannels; - int highestTransportType0, highestTransportType1; - char line[1024]; - - NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail); - // Head GPU index is always 0 - for (int c = 0; c < nHeads; c++) { - heads[c] = collNetGraph->intra[c * comm->localRanks + 0]; - } - - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - for (int h = 0; h < nHeads; h++) { - const int head = heads[h]; - collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv); - if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend); - } - // Verify CollNet setup across ranks after trying the first channel - if (c == 0) { - NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); - } - } - // Verify CollNet setup across ranks after trying all channels - NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); - TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); - - line[0] = '\0'; - for (int c = 0; c < comm->nChannels; c++) { - struct ncclTree* chain = &comm->channels[c].collnetChain; - snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d", - c, chain->down[0], rank, chain->up); - } - line[1023] = '\0'; - - INFO(NCCL_INIT, "Collnet Chains %s", line); - // Connect Collnet + chain - for (int c = 0; c < 
comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0), ret, fail); - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1), ret, fail); - INFO(NCCL_INIT, "Connected collnet + chain"); - - // Connect intra-node CollNet + Direct - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channelRecv = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0, &highestTransportType0), ret, fail); - - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channelSend = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1, &highestTransportType1), ret, fail); - -#if 0 - // Exchange highest intra-node transport type among ranks - // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer - comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); - for (int i = 0; i < comm->localRanks; i++) { - if (highestTypes[i] > comm->intraHighestTransportType) - comm->intraHighestTransportType = highestTypes[i]; - } -#endif - INFO(NCCL_INIT, "rank %d Connected CollNet", rank); - -exit: - free(heads); - return ret; -fail: - ncclTransportCollNetFree(comm); - comm->collNetSupport = 0; - goto exit; -} - -ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, - struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph, struct ncclComm* parent) { // We use 2 AllGathers // 1. { peerInfo, comm, compCap} // 2. 
{ nChannels, graphInfo, topoRanks } ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; - //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); cpu_set_t affinitySave; - //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); - //NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)commId, comm), ret, fail); + //struct ncclTopoGraph ringGraph; + //struct ncclTopoGraph treeGraph; + //struct ncclTopoGraph collNetGraph; + //struct ncclTopoGraph nvlsGraph; + struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; + + int nChannelsOrig; + struct ncclTopoRanks** allTopoRanks = NULL; + int *nodesFirstRank = NULL, *nodesTreePatterns = NULL; + int *rings = NULL; + int* nvbPeers = NULL; + struct ncclProxyConnector proxyConn; + int* pxnPeers = NULL; + int *topParentLocalRanks = NULL; + int tpProxyRank; // AllGather1 - begin //NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root //NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, commHash), ret, fail); //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); - //If virtualId == -1 multiRank support has not been requested by user, using original interface - if (comm->virtualId == -1) { - for (int i = 0; i < nranks; i++) { - if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { - WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); - ret = ncclInvalidUsage; - goto fail; - } - } - } - else { - //Multiple ranks can use the same device, but need to have different virtualId's. 
- for (int i = 0; i < nranks; i++) { - for (int j=0; j < nranks; j++) { - if (j==i) continue; - if((comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash) && - (comm->peerInfo[i].busId == comm->peerInfo[j].busId) && - (comm->peerInfo[i].virtualId == comm->peerInfo[j].virtualId)) { - WARN("Duplicate virtualId detected : rank %d and rank %d both on GPU device %lx virtualId %d", - i, j, comm->peerInfo[rank].busId, comm->peerInfo[i].virtualId); - return ncclInvalidUsage; - } - } + for (int i = 0; i < nranks; i++) { + if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); + ret = ncclInvalidUsage; + goto fail; } } // AllGather1 - end @@ -618,6 +698,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t do { // Compute intra-process ranks int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; + for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap); + for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap); for (int i = 0; i < nranks; i++) { if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { @@ -682,8 +764,19 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t // sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); //} - // Launch proxy service thread - //NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); + // Determine local CollNet support + if (collNetSupport(comm)) { + char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); + if (collNetEnable != NULL) { + INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); + if (strcmp(collNetEnable, "1") == 0) { + 
comm->collNetSupport = 1; + } + } + } + + // Determine local Nvls support + //NCCLCHECK(ncclNvlsInit(comm)); // Get rings and trees ringGraph.id = 0; @@ -706,8 +799,24 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); + if (comm->collNetSupport) { + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); + } else { + collNetGraph.nChannels = 0; + } + + nvlsGraph.id = 3; + nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS; + nvlsGraph.collNet = 0; + nvlsGraph.minChannels = 1; + nvlsGraph.maxChannels = MAXCHANNELS; + if (comm->nvlsSupport) { + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail); + } else { + nvlsGraph.nChannels = 0; + } bool allXgmi, hasPeerAccess; allXgmi = true; @@ -735,22 +844,10 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; if (comm->rank == ncclParamGraphDumpFileRank()) { - struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; - NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 3, graphs), ret, fail); + struct ncclTopoGraph* dumpGraphs[4] = { &ringGraph, &treeGraph, &collNetGraph, &nvlsGraph }; + NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 4, dumpGraphs), ret, fail); } - // Determine local CollNet support before all-gather - if (collNetSupport(comm)) { - char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); - if (collNetEnable != NULL) { - INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); - if 
(strcmp(collNetEnable, "1") == 0) { - comm->collNetSupport = 1; - } - } - } - if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0; - if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) { if (rcclParamP2pNetDisable() == 0) { if (!(comm->topo->type & RCCL_TOPO_FORCE_INTRA)) comm->p2pNet = 1; @@ -764,64 +861,51 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t int idx; NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx)); allGather3Data[rank].nc = 2; - if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) || - (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) && + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi) allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2); - if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) || - (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) && + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)) allGather3Data[rank].nc = 4; - if (((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) || - (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) && + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; - NCCLCHECKGOTO(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev), ret, fail); - allGather3Data[rank].tree.pattern = treeGraph.pattern; - 
allGather3Data[rank].tree.nChannels = treeGraph.nChannels; - allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; - allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra; - allGather3Data[rank].tree.bwInter = treeGraph.bwInter; - allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; - allGather3Data[rank].tree.typeInter = treeGraph.typeInter; - allGather3Data[rank].ring.pattern = ringGraph.pattern; - allGather3Data[rank].ring.nChannels = ringGraph.nChannels; - allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; - allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra; - allGather3Data[rank].ring.bwInter = ringGraph.bwInter; - allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; - allGather3Data[rank].ring.typeInter = ringGraph.typeInter; - allGather3Data[rank].collNet.pattern = collNetGraph.pattern; - allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels; - allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; - allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra; - allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter; - allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; - allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter; - allGather3Data[rank].collNetSupport = comm->collNetSupport; allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable(); comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable(); allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled; allGather3Data[rank].mscclEnabled = comm->topo->mscclEnabled; - comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count) - ? 
std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels; - NCCLCHECKGOTO(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks), ret, fail); + for (int a=0; apattern; + allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels; + allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels; + allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra; + allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter; + allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra; + allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter; + } + + comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); fail: return ret; } -ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, - struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { +ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph) { + ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; - ncclResult_t ret; + cpu_set_t affinitySave; + + struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; + int nChannelsOrig; struct ncclTopoRanks** allTopoRanks = NULL; int *nodesFirstRank = NULL, *nodesTreePatterns = NULL; @@ -829,6 +913,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t int* nvbPeers = NULL; struct ncclProxyConnector proxyConn; int* pxnPeers = NULL; + int *topParentLocalRanks = NULL; + int tpProxyRank; //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail); @@ -844,7 +930,7 @@ 
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t comm->nNodes++; nodesFirstRank[node] = firstRank; // Record tree pattern of each node as they can be different depending on sm arch - nodesTreePatterns[node] = allGather3Data[r].tree.pattern; + nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern; } comm->rankToNode[r] = node; } @@ -887,32 +973,22 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t int nc; nc = allGather3Data[0].nc; for (int i=0; ipeerInfo[i].netDev = allGather3Data[i].netDev; allTopoRanks[i] = &allGather3Data[i].topoRanks; nc = std::min(allGather3Data[i].nc, nc); // Make sure we align all ranks so that the tuning is consistent across ranks - treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels); - treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); - treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra); - treeGraph.bwInter = std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter); - treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); - treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter); - ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); - ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); - ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra); - ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter); - ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); - ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter); - collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); - collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, 
collNetGraph.sameChannels); - collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra); - collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter); - collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); - collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); - comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled; comm->topo->ll128Enabled = comm->topo->ll128Enabled && allGather3Data[i].ll128Enabled; comm->topo->mscclEnabled = comm->topo->mscclEnabled && allGather3Data[i].mscclEnabled; + for (int a=0; anChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels); + graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels); + graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra); + graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter); + graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra); + graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); + } + if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; + if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = 0; } comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = @@ -941,8 +1017,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t } NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc), ret, fail); + NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, nc), ret, fail); if 
(comm->topo->pivotA2ANumBiRings == 3) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph)); // AllGather3 - end @@ -963,6 +1039,29 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t //NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail); + // Compute nChannels per peer for p2p + NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); + + /* until now, all info of comm should be known. We can initialize shared resources and + * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before + * all proxy operations. */ + if (comm->sharedRes->owner == comm) { + comm->sharedRes->tpNLocalRanks = comm->localRanks; + comm->sharedRes->magic = comm->magic; + comm->sharedRes->tpNChannels = comm->nChannels; + comm->sharedRes->tpP2pNChannels = comm->p2pnChannels; + memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks); + } + NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail); + for (int i = 0; i < comm->localRanks; ++i) { + int tpRank = comm->topParentRanks[comm->localRankToRank[i]]; + topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank]; + } + comm->topParentLocalRanks = topParentLocalRanks; + + // Launch proxy service thread, after this, the proxy calls can be used. 
+ //NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); + // Connect with prev/next for each ring for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; @@ -993,39 +1092,46 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail); INFO(NCCL_INIT, "Connected all trees"); +#if 0 + // Setup NVLS + NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); + // And NVLS trees if needed + if (comm->nvlsSupport && comm->localRanks > 1) { + for (int c=0; cnvlsChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail); + INFO(NCCL_INIT, "Connected NVLS tree"); + } +#endif +#if CUDART_VERSION >= 12010 // Check if we can setup CollNet - if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph); + if (comm->collNetSupport > 0) collNetTrySetup(comm, parent, &collNetGraph); +#endif TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations - do { - int myCompCap = comm->peerInfo[rank].cudaCompCap; - int minCompCap = myCompCap, maxCompCap = myCompCap; - for (int i = 0; i < nranks; i++) { - minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap); - maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap); - } - NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail); - } while(0); - - // Compute nChannels per peer for p2p - NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); + NCCLCHECKGOTO(ncclTopoTuneModel(comm, 
comm->minCompCap, comm->maxCompCap, graphs), ret, fail); INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); #if 0 do { // Setup p2p structures in comm->tasks struct ncclTasks* tasks = &comm->tasks; - int nRanks = comm->nRanks; int node = comm->node; int nNodes = comm->nNodes; struct ncclNodeRanks *nodeRanks = comm->nodeRanks; int localRank = comm->localRank; - tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - int s=0, r=0; + // We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8. + int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2); + tasks->p2pOrderSteps = comm->nNodes * steps; + tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); + tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); + tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); + int i=0; // schedule delta 0, +1, -1, +2, -2, ... // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. for (int d=0; d <= nNodes/4; d++) { @@ -1035,18 +1141,14 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t sched_delta: int recvNode = (node+nNodes-delta)%nNodes; int sendNode = (node+delta)%nNodes; - int steps = comm->maxLocalRanks; for (int step=0; step < steps; step++) { int recvIndex = (localRank-step+steps)%steps; - if (recvIndex < nodeRanks[recvNode].localRanks) { - tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex]; - r++; - } + int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? 
nodeRanks[recvNode].localRankToRank[recvIndex] : -1; + tasks->p2pRecvOrder[i] = recvRank; int sendIndex = (localRank+step)%steps; - if (sendIndex < nodeRanks[sendNode].localRanks) { - tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex]; - s++; - } + int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1; + tasks->p2pSendOrder[i] = sendRank; + i++; } index++; if (index == 1 && deltas[1] == deltas[0]) index++; @@ -1058,7 +1160,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t goto sched_delta; } } - assert(s == nRanks && r == nRanks); + assert(i == tasks->p2pOrderSteps); } while (0); if (ncclParamNvbPreconnect()) { @@ -1070,35 +1172,37 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t int channelId; for (int c=0; cp2pnChannelsPerPeer; c++) { NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer].send[1].connected == 0) { + if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { comm->connectSend[peer] |= (1UL<p2pnChannelsPerPeer; c++) { NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { + if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { comm->connectRecv[peer] |= (1UL<rank, &proxyConn), ret, fail); - //NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); + tpProxyRank = comm->topParentRanks[comm->rank]; + //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + //NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); // Then to remote ones when using PXN if (ncclPxnDisable(comm) == 0) { int nranks; 
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); for (int r=0; rp2pnChannels, sizeof(int), NULL, 0), ret, fail); + tpProxyRank = comm->topParentRanks[pxnPeers[r]]; + //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + //NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); } } - #if 0 if (comm->intraRank == 0) { // Load ncclParamLaunchMode char* str = getenv("NCCL_LAUNCH_MODE"); @@ -1129,8 +1233,10 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t exit: //if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - // Unlink proxy shm to make sure it will be properly cleaned up. - //ncclProxyShmUnlink(comm); + /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can + * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be + * properly cleaned up. */ + //if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank);