Merge remote-tracking branch 'nccl/master' into develop

[ROCm/rccl commit: abd0615351]
This commit is contained in:
Wenkai Du
2023-06-21 20:54:24 -07:00
83 changed files with 5798 additions and 4090 deletions
+3
View File
@@ -317,6 +317,8 @@ set(SRC_FILES
src/include/git_version.h
src/include/graph.h
src/include/group.h
src/include/ibvcore.h
src/include/ibvsymbols.h
src/include/ibvwrap.h
src/include/info.h
src/include/ipcsocket.h
@@ -379,6 +381,7 @@ set(SRC_FILES
src/misc/argcheck.cc
# src/misc/cudawrap.cc
# src/misc/gdrwrap.cc
src/misc/ibvsymbols.cc
src/misc/ibvwrap.cc
src/misc/ipcsocket.cc
src/misc/msccl/msccl_lifecycle.cc
+5
View File
@@ -12,6 +12,7 @@ DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 1
NVTX ?= 1
RDMA_CORE ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
@@ -106,3 +107,7 @@ endif
ifneq ($(PROFAPI), 0)
CXXFLAGS += -DPROFAPI
endif
ifneq ($(RDMA_CORE), 0)
CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1
endif
+2 -2
View File
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 17
NCCL_PATCH := 1
NCCL_MINOR := 18
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
View File
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
misc/ipcsocket.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
+72 -7
View File
@@ -222,7 +222,6 @@ struct bootstrapState {
int cudaDev;
int rank;
int nranks;
int virtualId;
uint64_t magic;
volatile uint32_t *abortFlag;
};
@@ -230,7 +229,6 @@ struct bootstrapState {
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) {
int rank = comm->rank;
int nranks = comm->nRanks;
int virtualId = comm->virtualId;
struct bootstrapState* state;
struct ncclSocket* proxySocket;
ncclSocketAddress nextAddr;
@@ -241,11 +239,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
state->rank = rank;
state->nranks = nranks;
state->abortFlag = comm->abortFlag;
state->virtualId = virtualId;
comm->bootstrap = state;
comm->magic = state->magic = handle->magic;
TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
// [RCCL] Register custom signal handlers if requested
RegisterSignalHandlers();
@@ -308,11 +305,79 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
// Builds the bootstrap state of a communicator `comm` that is being split off
// from `parent`.
//
// handle      : supplies the magic value stored in both comm and the new state.
// comm        : the new communicator; its rank/nRanks/abortFlag are copied into
//               the state and comm->bootstrap is pointed at it.
// parent      : the communicator being split; its bootstrap is used to exchange
//               listen addresses with the ring neighbours.
// color, key  : only used in the final log message here.
// parentRanks : indexed by rank in `comm`, yields the matching rank in `parent`.
//
// Returns ncclSuccess, or the first error reported by a checked call.
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
int prev, next;
ncclSocketAddress listenAddr, tmpAddr;
struct ncclSocket* proxySocket;
struct bootstrapState* state;
NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail);
state->rank = rank;
state->nranks = nranks;
state->abortFlag = comm->abortFlag;
// The state is attached to comm before any socket work is attempted.
comm->bootstrap = state;
comm->magic = state->magic = handle->magic;
// Ring neighbours of this rank in the new communicator, expressed as ranks
// of the parent communicator (wraps around at both ends).
prev = parentRanks[(rank-1+nranks)%nranks];
next = parentRanks[(rank+1)%nranks];
// Setup my sockets for the allgather ring and other p2p connections
NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
// Create socket for other ranks to contact me
NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail);
// Exchange listen addresses through the parent's bootstrap: my address goes to
// prev, and the address of next is received into tmpAddr. Both use tag -2.
// NOTE(review): -2 is presumably a tag reserved for split traffic - confirm.
NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail);
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail);
// Connect the send side of the ring to the address just received from next.
NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail);
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail);
// AllGather all listen handlers
NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail);
// Seed my own slot; the allgather fills in the other ranks' addresses.
memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress));
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail);
if (parent->config.splitShare) {
/* map local rank to top parent local rank. */
for (int i = 0; i < nranks; ++i) {
comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
}
// Reuse the proxy state held in the parent's shared resources and take a
// reference on it; no new proxy socket is created on this path.
comm->proxyState = parent->sharedRes->proxyState;
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
} else {
// Create the service proxy
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail);
NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail);
NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
// Same pattern as above: seed my slot, then allgather the proxy addresses.
memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress));
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
}
INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
exit:
return ret;
fail:
// No cleanup is performed here; the error code is simply returned.
// NOTE(review): state, sockets and proxySocket allocated above are presumably
// released when the communicator is torn down - confirm against the caller.
goto exit;
}
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
struct bootstrapState* state = (struct bootstrapState*)commState;
char* data = (char*)allData;
@@ -344,7 +409,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
@@ -405,7 +470,7 @@ ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank,
}
}
else {
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size));
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/ranks[rank], bcastData, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
+124 -29
View File
@@ -17,32 +17,122 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
channel->id = channelId;
channel->workFifoSent = 0;
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
struct ncclSharedResources* sharedRes = comm->sharedRes;
// The extra on nRanks+1 is for collnet root (i.e. network)
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers);
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devPeers);
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
CUDACHECK(hipEventRecord(comm->deviceStream.scratchEvent, comm->deviceStream.cudaStream));
CUDACHECK(hipStreamWaitEvent(comm->deviceStream.cudaStream, comm->deviceStream.scratchEvent, 0));
for (int r=0; r < nPeers; ++r) {
for (int b=0; b < NCCL_MAX_CONNS; b++) {
channel->peers[r].send[b].comm = comm;
channel->peers[r].recv[b].comm = comm;
if (channel->peers == NULL) {
// The extra on nRanks+1 is for collnet root (i.e. network)
// Allocate everything related to sharedRes with ncclCalloc as this can be
// shared between communicators hence should not be tied to comm.
if (sharedRes->peers[channelId] == NULL) {
NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks));
}
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers);
for (int r = 0; r < nRanks; r++) {
channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r];
ncclAtomicRefCountIncrement(&channel->peers[r]->refCount);
}
}
if (channel->devPeers == NULL) {
if (sharedRes->devPeers[channelId] == NULL) {
NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream));
}
/* channel->devPeers is not shared, so just free it when calling commFree() */
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devPeers);
for (int r = 0; r < nRanks; r++) {
uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
}
}
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
CUDACHECK(hipEventRecord(sharedRes->deviceStream.scratchEvent, sharedRes->deviceStream.cudaStream));
CUDACHECK(hipStreamWaitEvent(sharedRes->deviceStream.cudaStream, sharedRes->deviceStream.scratchEvent, 0));
return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
// Attaches the NVLS peer entries to channel `channelId` of `comm`.
//
// When `share` is true the peer arrays of the same channel in `parent` are
// reused and entry r of this communicator maps to entry
// comm->topParentLocalRanks[r] of the parent. Otherwise fresh host and device
// arrays of comm->localRanks entries are allocated and entry r maps to itself.
// In both cases the entries are installed in channel->peers / channel->devPeers
// at slots nRanks+1 .. nRanks+localRanks and each referenced peer gets its
// refCount incremented.
//
// Does nothing if the channel already has nvlsPeers. Runs initChannel() first
// if the channel id is still -1.
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
  struct ncclChannel* chan = &comm->channels[channelId];
  struct ncclSharedResources* shared = comm->sharedRes;

  if (chan->nvlsPeers) {
    return ncclSuccess;
  }

  if (chan->id == -1) {
    NCCLCHECK(initChannel(comm, channelId));
  }

  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&shared->deviceStream));

  // Pick the backing arrays: borrowed from the parent, or newly allocated.
  if (share) {
    struct ncclChannel* parentChan = &parent->channels[channelId];
    chan->nvlsPeers = parentChan->nvlsPeers;
    chan->nvlsDevPeers = parentChan->nvlsDevPeers;
  } else {
    NCCLCHECK(ncclCalloc(&chan->nvlsPeers, comm->localRanks));
    NCCLCHECK(ncclCudaCallocAsync(&chan->nvlsDevPeers, comm->localRanks, shared->deviceStream.cudaStream));
  }

  // NVLS entries live after the nRanks regular peers and the single slot at nRanks.
  const int firstSlot = comm->nRanks + 1;
  for (int i = 0; i < comm->localRanks; i++) {
    // Index into the backing arrays: remapped through the parent when sharing.
    const int srcIdx = share ? comm->topParentLocalRanks[i] : i;
    uintptr_t devAddr = (uintptr_t)(chan->nvlsDevPeers + srcIdx);
    chan->peers[firstSlot + i] = chan->nvlsPeers + srcIdx;
    // Publish the device-side pointer for this slot.
    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(chan->devPeers + firstSlot + i), (uintptr_t*)&devAddr, 1, shared->deviceStream.cudaStream));
    ncclAtomicRefCountIncrement(&chan->nvlsPeers[srcIdx].refCount);
  }

  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &shared->deviceStream));
  return ncclSuccess;
}
// Attaches the single CollNet peer entry to channel `channelId` of `comm`.
//
// When `share` is true the collnet peer (host and device) of the same channel
// in `parent` is reused; otherwise a fresh one-element host array and device
// array are allocated. The entry is installed at slot nRanks of
// channel->peers / channel->devPeers and its refCount is incremented.
//
// Does nothing if the channel already has collnetPeers. Runs initChannel()
// first if the channel id is still -1.
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
  struct ncclChannel* chan = &comm->channels[channelId];
  struct ncclSharedResources* shared = comm->sharedRes;

  if (chan->collnetPeers) {
    return ncclSuccess;
  }

  if (chan->id == -1) {
    NCCLCHECK(initChannel(comm, channelId));
  }

  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&shared->deviceStream));

  // Pick the backing storage: borrowed from the parent, or newly allocated.
  if (share) {
    struct ncclChannel* parentChan = &parent->channels[channelId];
    chan->collnetPeers = parentChan->collnetPeers;
    chan->collnetDevPeers = parentChan->collnetDevPeers;
  } else {
    NCCLCHECK(ncclCalloc(&chan->collnetPeers, 1));
    NCCLCHECK(ncclCudaCallocAsync(&chan->collnetDevPeers, 1, shared->deviceStream.cudaStream));
  }

  // Install the peer at slot nRanks on the host side, then publish the
  // matching device pointer to the same slot of devPeers.
  uintptr_t devAddr = (uintptr_t)chan->collnetDevPeers;
  chan->peers[comm->nRanks] = chan->collnetPeers;
  NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(chan->devPeers + comm->nRanks), (uintptr_t*)&devAddr, 1, shared->deviceStream.cudaStream));
  ncclAtomicRefCountIncrement(&chan->collnetPeers->refCount);

  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &shared->deviceStream));
  return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) {
int nPeers = nRanks + collnetNRanks + nvlsNRanks;
/* channel peers are only valid when async init thread completes commAlloc() and
* the channel is initialized with initChannel(); if either is not done, this channel
* should never be freed. */
@@ -50,18 +140,23 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
// Free transport proxy resources
// Note: free all send resources first due to CollNet arrangement
for (int r=0; r<nRanks+1; r++) {
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
for (int r = 0; r < nPeers; r++) {
struct ncclChannelPeer* peer = channel->peers[r];
if (peer) {
if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
if (r == nRanks) {
free(channel->collnetPeers);
ncclCudaFree(channel->collnetDevPeers);
} else if (r == nPeers - 1) {
free(channel->nvlsPeers);
ncclCudaFree(channel->nvlsDevPeers);
}
}
}
}
for (int r=0; r<nRanks+1; r++) {
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
}
return ncclSuccess;
}
@@ -51,7 +51,7 @@ namespace {
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16);
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -85,7 +85,7 @@ namespace {
if (inputBuf + chunkOffset == outputBuf + offset) { // In place
prims.directSend(chunkOffset, offset, nelem);
} else {
prims.directCopySend(chunkOffset, offset, offset, nelem);
prims.directCopySend(chunkOffset, offset, nelem);
}
// k-2 steps: copy to next GPU
@@ -93,7 +93,7 @@ namespace {
rankDest = ringRanks[nranks-j];
offset = chunkOffset + rankDest * size;
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
// Make final copy from buffer to dest.
@@ -148,19 +148,19 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
if (tid < tidEndGather) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndBcast) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Bcast through MC
// Bcast through NVLS
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args);
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
+196 -49
View File
@@ -66,7 +66,7 @@ namespace {
}
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -158,7 +158,7 @@ namespace {
}
#endif
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true);
prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
@@ -180,7 +180,7 @@ namespace {
chunk = modRanks(ringIx + nranks-j);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
@@ -342,7 +342,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directSendFromOutput(offset, offset, nelem);
prims.directSendFromOutput(offset, nelem);
}
}
else if (tree->down[0] == -1) {
@@ -356,7 +356,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
}
@@ -446,7 +446,7 @@ namespace {
chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize);
if (tree->up == -1) {
// Reduce and broadcast. Max number of recv is 3, max number of send is 3
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
@@ -467,7 +467,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*doPost=*/true);
prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
@@ -530,7 +530,8 @@ namespace {
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth);
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -557,7 +558,7 @@ namespace {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
prims.directRecvCopySend(offset, nelem);
}
}
@@ -621,9 +622,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
int group = (2*Proto::MaxGroupWidth) | (1<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff,
args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -634,16 +635,16 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
}
} else if (tid >= tidStartReduce && direct->out != -1) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
if (hasDn) {
// Reduce, send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
if (args->regUsed) {
prims.directRecvReduceSend(offset, offset, nelem);
prims.directRecvReduceSend(offset, nelem);
} else {
prims.recvReduceSend(offset, nelem);
}
@@ -651,7 +652,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
} else {
// Directly send to network
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -660,29 +662,30 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
} else if (tid < tidStartBcast && hasUp) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
int group = (1*Proto::MaxGroupWidth) | (0<<16);
if (hasDn) {
// Recv from network, broadcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
}
} else {
// Recv from network (no post thread needed)
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -705,23 +708,27 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const int reduceWarps = nranks <= 6 ? 6 : 4;
const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
const bool hasOut = nvls->out != -1;
const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasOut ? 2 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int nThreadsScatter = copyWarps*WARP_SIZE;
const int nThreadsGather = (copyWarps-1)*WARP_SIZE;
const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
const int nThreadsReduce = reduceWarps*WARP_SIZE;
const int nThreadsBcast = (bcastWarps)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>;
const int tidEndBcast = tidEndReduce + nThreadsBcast;
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
@@ -729,19 +736,136 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
}
} else if (tid < tidEndGather) {
// Gather
int group = (2*Proto::MaxGroupWidth) | (0<<16);
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce, broadcast through NVLS
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasOut) {
// Reduce, broadcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const int treeUp = nvls->treeUp;
const int* treeDown = nvls->treeDown;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const bool hasUp = treeUp != -1;
const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasUp ? 4 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
const int nThreadsReduce = reduceWarps*WARP_SIZE;
const int nThreadsBcast = (bcastWarps)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
const int tidEndBcast = tidEndReduce + nThreadsBcast;
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasUp) {
// Reduce and Broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -762,21 +886,26 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
ncclTree *tree = &ncclShmem.channel.collnetChain;
ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t loopSize = int(nChannels*chunkSize);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t size = args->count;
int nthreadsSplit = nthreads/2;
if (nthreadsSplit >= 256) nthreadsSplit += 64;
int group, send, recv, groupTid, groupNthreads;
int group, connIndex, send, recv, groupTid, groupNthreads;
using Proto = ProtoSimple<1, 1>;
if (tid < nthreadsSplit) {
group = (0*Proto::MaxGroupWidth) | (1<<16);
// Reduce up the chain
group = 0;
connIndex = 1;
recv = tree->down[0];
send = tree->up;
groupTid = tid;
groupNthreads = nthreadsSplit;
} else {
group = (1*Proto::MaxGroupWidth);
// Broadcast down the chain
group = 1;
connIndex = 0;
recv = tree->up;
send = tree->down[0];
groupTid = tid - nthreadsSplit;
@@ -784,7 +913,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
}
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group);
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex);
if (tid < nthreadsSplit) {
if (recv == -1) {
@@ -802,17 +932,34 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
}
}
else {
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecv(offset, nelem);
if (recv == nranks) {
// I'm the first in the broadcast chain, I need to perform the division (postOp)
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem, /*postOp*/true);
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
}
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, offset, nelem);
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecv(offset, nelem);
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, nelem);
}
}
}
}
@@ -51,7 +51,7 @@ namespace {
if (num_hops == 0 && args->sendbuff != args->recvbuff) {
const T* sendbuff = (const T*)args->sendbuff + send_offset;
T* recvbuff = (T *)args->recvbuff + recv_offset;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(
reduceCopy<COLL_UNROLL, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
tid, nthreads, 0, nullptr, false, 1, (void **)&sendbuff, 1, (void **)&recvbuff, send_recv_size);
} else {
for (ssize_t prims_offset = 0; prims_offset < send_recv_size; prims_offset += prims_size) {
@@ -50,7 +50,7 @@ namespace {
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16);
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
+41 -40
View File
@@ -42,7 +42,8 @@
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify)
NCCL_FUNC5(func, NVLS, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
@@ -119,8 +120,8 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
#endif
};
static_assert(FUNC_INDEX_P2P == 4510, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 4511, "Wrong AllToAllPivot function index");
static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index");
#ifndef USE_INDIRECT_FUNCTION_CALL
template<unsigned short f, unsigned short l, bool u>
@@ -180,46 +181,46 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
else
// A string literal converts to a non-null pointer, so assert("...") is always
// true and can never fire. Negating the literal makes the assertion fail on
// this path while keeping the message text in the diagnostic.
assert(!"Unsupported function index");
#else
if (funcIndex < 900) {
if (funcIndex % 15 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (funcIndex % 15 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (funcIndex % 15 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 15 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
if (funcIndex < 1080) {
if (funcIndex % 18 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (funcIndex % 18 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (funcIndex % 18 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 18 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
}
else if (funcIndex < 1800) Caller<900, 1800, USING_LL128>::call(funcIndex);
else if (funcIndex < 2700) {
if (funcIndex % 15 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (funcIndex % 15 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (funcIndex % 15 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 15 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else if (funcIndex < 2160) Caller<1080, 2160, USING_LL128>::call(funcIndex);
else if (funcIndex < 3240) {
if (funcIndex % 18 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (funcIndex % 18 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (funcIndex % 18 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 18 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
}
else if (funcIndex < 4500) Caller<2700, 4500, USING_LL128>::call(funcIndex);
else if (funcIndex < 5400) Caller<3240, 5400, USING_LL128>::call(funcIndex);
else {
switch (funcIndex - 4500) {
switch (funcIndex - 5400) {
case 0:
ncclFunction_OneRankReduce_PreMulSum_int8_t();
break;
@@ -353,7 +354,6 @@ struct ncclShmemGroup {
ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
void* srcs[NCCL_MAX_NVLS_ARITY+1];
void* dsts[NCCL_MAX_NVLS_ARITY+1];
int nvlsRecv;
uint64_t barrier;
uint64_t barrier_next[NCCL_MAX_GROUPS];
};
@@ -621,7 +621,8 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
IMPL_COLL4(func, RING, devredop, type) \
IMPL_COLL4(func, COLLNET_DIRECT, devredop, type) \
IMPL_COLL4(func, COLLNET_CHAIN, devredop, type) \
IMPL_COLL4(func, NVLS, devredop, type)
IMPL_COLL4(func, NVLS, devredop, type) \
IMPL_COLL4(func, NVLS_TREE, devredop, type)
#define IMPL_COLL2(func, devredop) \
IMPL_COLL3(func, devredop, int8_t) \
@@ -28,7 +28,8 @@ inline __device__ int loadInt(int* ptr) {
}
template<typename RedFn, typename T, int Unroll, int BytePerPack,
int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int PreOpSrcs,
int MultimemSrcs, int MinSrcs, int MaxSrcs,
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
typename IntBytes>
__device__ __forceinline__ void reduceCopyPacks(
int nThreads, int &thread,
@@ -37,6 +38,7 @@ __device__ __forceinline__ void reduceCopyPacks(
IntBytes &nBytesBehind, IntBytes &nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
//if (BytePerPack == 0) __trap();
// A hunk is the amount of contiguous data a warp consumes per loop iteration
// assuming all threads partake.
@@ -49,15 +51,15 @@ __device__ __forceinline__ void reduceCopyPacks(
IntBytes threadBytesBehind = nBytesBehind + (warp*BytePerHunk + lane*BytePerPack);
IntBytes threadBytesAhead = nBytesAhead - (warp*BytePerHunk + lane*BytePerPack);
// Number of hunks to be consumed over all warps.
IntBytes nHunksAhead = nBytesAhead/BytePerHunk;
IntBytes nHunksAhead = nBytesAhead/(BytePerHunk + !BytePerHunk);
// Advance collective position.
nBytesBehind += nHunksAhead*BytePerHunk;
nBytesAhead -= nHunksAhead*BytePerHunk;
if (Unroll==1 && BytePerPack <= nBytesAhead) {
// Only Unroll=1 can do partial hunks (where not all threads partake).
nHunksAhead += 1;
nBytesBehind += nBytesAhead - (nBytesAhead%BytePerPack);
nBytesAhead = nBytesAhead%BytePerPack;
nBytesBehind += nBytesAhead - (nBytesAhead%(BytePerPack + !BytePerPack));
nBytesAhead = nBytesAhead%(BytePerPack + !BytePerPack);
}
nHunksAhead -= warp;
@@ -79,8 +81,13 @@ __device__ __forceinline__ void reduceCopyPacks(
{ RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
if (0 < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[0]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
}
minSrcs[0] += WARP_SIZE*BytePerPack;
if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
}
@@ -92,8 +99,13 @@ __device__ __forceinline__ void reduceCopyPacks(
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
if (s < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
// Load into tmp[u], exactly like the non-multimem branch: storing into acc[u]
// here would overwrite what was accumulated from the earlier sources and
// leave tmp[u] unset for this source.
tmp[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[s]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
}
minSrcs[s] += WARP_SIZE*BytePerPack;
}
#pragma unroll Unroll
@@ -130,7 +142,11 @@ __device__ __forceinline__ void reduceCopyPacks(
for (int d=0; d < MinDsts; d++) {
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
st_global<BytePerPack>(minDsts[d], acc[u]);
if (d < MultimemDsts) {
multimem_st_global(minDsts[d], acc[u]);
} else {
st_global<BytePerPack>(minDsts[d], acc[u]);
}
minDsts[d] += WARP_SIZE*BytePerPack;
}
}
@@ -167,215 +183,61 @@ __device__ __forceinline__ void reduceCopyPacks(
}
template<int Unroll, typename RedFn, typename T,
int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int PreOpSrcs,
int MultimemSrcs, int MinSrcs, int MaxSrcs,
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
typename IntBytes>
__device__ __forceinline__ void ReduceOrCopyMulti(
__device__ __forceinline__ void reduceCopy(
int thread, int nThreads,
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
IntBytes nElts
) {
static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values.");
//int nWarps = nThreads/WARP_SIZE;
//int warp = thread/WARP_SIZE;
int lane = thread%WARP_SIZE;
// Check that all is 16B aligned. If not don't use 16B load/stores.
int aligned = 1;
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane])%4;
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane])%4;
aligned = !(__any(!aligned));
// If a multimem src is present then our biggest pack size is limited to what
// is supported for this redfn/type.
constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize<RedFn>::BigPackSize;
IntBytes nBytesBehind = 0;
IntBytes nBytesAhead = nElts*sizeof(T);
if (aligned) {
reduceCopyPacks<RedFn, T, Unroll*((MinSrcs == 1 && MinDsts == 1) ? 2 : 1), /*BytePerPack=*/16,
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/16,
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
#if __cpp_if_constexpr
if constexpr (BigPackSize > sizeof(T)) {
#else
if (BigPackSize > sizeof(T)) {
#endif
// Check that all pointers are BigPackSize aligned.
bool aligned = true;
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize);
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize);
aligned = !(__any(!aligned));
if (aligned) {
reduceCopyPacks<RedFn, T, Unroll*((MinSrcs == 1 && MinDsts == 1) ? 2 : 1), BigPackSize,
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, BigPackSize,
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
}
}
reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/sizeof(T),
MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
}
// Copies from srcAddr to dstAddr using multimem load/store. The amount copied
// will be at most Unroll*BytePerPack*WARP_SIZE. If Partial=1, then the amount
// will be the min() of that and nBytesAhead. If srcAddr is not BytePerPack
// aligned then the amount copied will be less by (srcAddr%BytePerPack) since
// we begin loads at the first pack containing the first element.
//
// NOTE: the whole body below is enclosed in `#if 0`, so as written this
// function compiles to an empty body and copies nothing. The description above
// applies to the disabled implementation.
//
// Parameters (as used by the disabled implementation):
//   lane        - caller's lane index; each lane starts at byte offset lane*BytePerPack.
//   redFn       - functor passed to applyLoadMultimem / applyPostOp.
//   postOp      - when true, applyPostOp is applied to every loaded pack.
//   srcAddr     - source address; rounded down by its misalignment when !SrcAligned.
//   dstAddr     - destination address.
//   nBytesAhead - bytes left to copy; only consulted for bounds when Partial is set.
//   scratchAddr - staging address written with st_shared on the unaligned path.
template<typename RedFn, typename T, int Unroll, int BytePerPack,
bool SrcAligned, // is srcAddr aligned to BytePerPack
bool DstAligned, // are dstAddr and nBytesAhead both aligned to BytePerPack
bool Partial, // is this a possibly partial hunk
typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem_WarpUnrolled(
int lane, RedFn redFn, bool postOp, uintptr_t srcAddr, uintptr_t dstAddr,
IntBytes nBytesAhead, uint32_t scratchAddr
) {
#if 0
// Round srcAddr down to a pack boundary; srcMisalign is the number of leading
// bytes that get over-read as a result.
int srcMisalign = SrcAligned ? 0 : srcAddr%BytePerPack;
srcAddr -= srcMisalign;
BytePack<BytePerPack> reg[Unroll];
int offset = lane*BytePerPack;
// Phase 1: load up to Unroll packs per lane into registers.
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (!Partial || (offset < srcMisalign + nBytesAhead)) {
reg[u] = applyLoadMultimem(redFn, srcAddr+offset);
if (postOp) reg[u] = applyPostOp(redFn, reg[u]);
}
offset += WARP_SIZE*BytePerPack;
}
if (SrcAligned && DstAligned) {
// Phase 2a: both sides aligned, so store the packs straight from registers.
offset = lane*BytePerPack;
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (!Partial || offset < nBytesAhead) {
multimem_st_global<BytePerPack>(dstAddr+offset, reg[u]);
}
offset += WARP_SIZE*BytePerPack;
}
} else {
// Phase 2b: stage the packs at scratchAddr, then let
// copyGlobalShared_WarpUnrolled write them out starting srcMisalign bytes in.
__syncwarp();
offset = lane*BytePerPack;
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (!Partial || (offset < srcMisalign + nBytesAhead)) {
st_shared<BytePerPack>(scratchAddr+offset, reg[u]);
}
offset += WARP_SIZE*BytePerPack;
}
__syncwarp();
if (!SrcAligned) {
// Ignore the beginning of the first pack corresponding to bytes overread
// due to misalignment.
nBytesAhead = min(nBytesAhead, Unroll*WARP_SIZE*BytePerPack - srcMisalign);
}
copyGlobalShared_WarpUnrolled
<sizeof(T), /*MaxBytes=*/Unroll*WARP_SIZE*BytePerPack, /*Multimem=*/1>
(lane, dstAddr, scratchAddr+srcMisalign, nBytesAhead);
}
#endif
}
// copyMultimemMultimem_IfEnabled is overloaded on the type of its first
// argument: `std::true_type` selects the real implementation and
// `std::false_type` selects this empty one. Dispatching on a tag keeps
// Apply_LoadMultimem from being instantiated for types/ops it does not
// support. C++17's "if constexpr" would be a nicer way to express this.
//
// Disabled case: every argument is accepted and ignored.
template<typename RedFn, typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem_IfEnabled(
    std::false_type enabled/*=false*/,
    int thread,
    int nThreads,
    uint64_t redArg,
    bool postOp,
    void *srcPtr,
    void *dstPtr,
    IntBytes nElts,
    uint32_t warpScratchAddr
  ) {
  // Intentionally empty: nothing is copied when multimem loads are unavailable.
}
// Enabled overload, selected when the first argument is std::true_type.
// Treats the buffer as nElts elements of RedFn::EltType and walks it in hunks
// of BytePerHunk = Unroll*WARP_SIZE*BytePerPack bytes, each hunk being handed
// to copyMultimemMultimem_WarpUnrolled by one warp.
//   thread, nThreads : split into warp/lane indices and a warp count below.
//   redArg           : argument used to construct the RedFn instance.
//   postOp           : forwarded unchanged to copyMultimemMultimem_WarpUnrolled.
//   srcPtr, dstPtr   : converted with cvta_to_global() before any arithmetic.
//   warpScratchAddr  : forwarded unchanged as the scratch address.
//                      NOTE(review): presumably per-warp shared memory; confirm with the caller.
template<typename RedFn, typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem_IfEnabled(
std::true_type enabled/*=true*/,
int thread, int nThreads, uint64_t redArg, bool postOp,
void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
) {
static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
constexpr int BytePerPack = Apply_LoadMultimem<RedFn>::PackSize;
using T = typename RedFn::EltType;
constexpr int Unroll = ncclNvlsUnroll(BytePerPack);
constexpr int BytePerHunk = Unroll*WARP_SIZE*BytePerPack;
int nWarps = nThreads/WARP_SIZE;
int warp = thread/WARP_SIZE;
int lane = thread%WARP_SIZE;
RedFn redFn(redArg);
uintptr_t srcAddr = cvta_to_global(srcPtr);
uintptr_t dstAddr = cvta_to_global(dstPtr);
// Bytes from this warp's current position to the end of the buffer. It is
// signed (see the static_assert) because the adjustments below may push it
// below zero for warps that have no work.
IntBytes warpBytesAhead = nElts*sizeof(T);
// Records which "call site" jumped to PartialHunk so it can jump back.
bool partialHunkIsFront;
// First handle misalignment of srcAddr.
if ((BytePerPack != sizeof(T)) && (srcAddr%BytePerPack != 0)) {
// If srcAddr isn't pack aligned then the first hunk processed will be short
// the same number of bytes as srcAddr's misalignment.
if (warp == 0) {
// Warp 0 processes the short front hunk through the shared PartialHunk code
// and resumes at PartialHunkFrontReturn.
partialHunkIsFront = true;
goto PartialHunk; // "call" PartialHunk()
PartialHunkFrontReturn:
// Together with the decrement below, warp 0 becomes warp nWarps-1.
warp = nWarps;
}
warp -= 1; // Rotate warp numbers for load balancing
int advanced = BytePerHunk-(srcAddr%BytePerPack); // since copyMultimemMultimem_WarpUnrolled shorts by the misalignment
srcAddr += advanced; // srcAddr is now pack aligned
dstAddr += advanced;
warpBytesAhead -= advanced;
}
// Move each warp to the first hunk it owns; hunks are dealt out to warps in
// order, one hunk per warp.
warpBytesAhead -= warp*BytePerHunk;
srcAddr += warp*BytePerHunk;
dstAddr += warp*BytePerHunk;
// Now that srcAddr is pack aligned detect if dstAddr is pack aligned.
if ((BytePerPack == sizeof(T)) || (dstAddr%BytePerPack == 0)) {
// Full hunks, destination aligned. Each iteration handles one hunk and then
// skips ahead by nWarps hunks.
while (BytePerHunk <= warpBytesAhead) {
copyMultimemMultimem_WarpUnrolled
<RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/true, /*DstAligned=*/true, /*Partial=*/false>
(lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
srcAddr += nWarps*BytePerHunk;
dstAddr += nWarps*BytePerHunk;
warpBytesAhead -= nWarps*BytePerHunk;
}
} else {
// Same loop as above, instantiated with DstAligned=false.
while (BytePerHunk <= warpBytesAhead) {
copyMultimemMultimem_WarpUnrolled
<RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/true, /*DstAligned=*/false, /*Partial=*/false>
(lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
srcAddr += nWarps*BytePerHunk;
dstAddr += nWarps*BytePerHunk;
warpBytesAhead -= nWarps*BytePerHunk;
}
}
// Fewer than BytePerHunk bytes remain for this warp: finish with one partial hunk.
if (0 < warpBytesAhead) {
partialHunkIsFront = false;
goto PartialHunk; // "call" PartialHunk()
PartialHunkBackReturn:;
}
return;
PartialHunk:
// We have to handle a partial hunk possibly at the front and back of the
// buffer. We generate the code once here since it's a lot of instructions,
// and then simulate function calls with gotos.
copyMultimemMultimem_WarpUnrolled
<RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/false, /*DstAligned=*/false, /*Partial=*/true>
(lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
// "Return" to whichever site jumped here.
if (partialHunkIsFront) goto PartialHunkFrontReturn;
goto PartialHunkBackReturn;
}
// Entry point: forwards every argument to copyMultimemMultimem_IfEnabled,
// choosing the enabled or disabled overload at compile time from whether
// Apply_LoadMultimem<RedFn>::PackSize is non-zero.
template<typename RedFn, typename IntBytes>
__device__ __forceinline__ void copyMultimemMultimem(
    int thread, int nThreads, uint64_t redArg, bool postOp,
    void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
  ) {
  // std::true_type when a multimem pack size exists for RedFn, std::false_type otherwise.
  using EnabledTag =
      std::integral_constant<bool, (Apply_LoadMultimem<RedFn>::PackSize != 0)>;
  copyMultimemMultimem_IfEnabled<RedFn>(
      EnabledTag(),
      thread, nThreads, redArg, postOp,
      srcPtr, dstPtr, nElts, warpScratchAddr);
}
#endif // COMMON_KERNEL_H_
@@ -26,7 +26,8 @@ __shared__ ncclShmemData ncclShmem;
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify)
NCCL_FUNC5(func, NVLS, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t
@@ -42,7 +42,7 @@ namespace {
dst += i0;
void *vsrc = (void*)src;
void *vdst = (void*)dst;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/1>
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
(tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
}
}
+39 -4
View File
@@ -7,6 +7,8 @@
#ifndef OP128_H_
#define OP128_H_
#include <type_traits>
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
v0 = __builtin_nontemporal_load(ptr);
v1 = __builtin_nontemporal_load(ptr+1);
@@ -88,6 +90,8 @@ __device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
template<int Size>
union BytePack;
// Specialization for Size == 0: an empty union with no data members.
template<>
union BytePack<0> {};
// One-byte pack. `u8` and `native` are two names for the same byte.
template<>
union BytePack<1> {
  uint8_t u8;
  uint8_t native;
};
@@ -130,14 +134,26 @@ union alignas(16) BytePack<16> {
};
template<typename T>
__device__ __forceinline__ BytePack<sizeof(T)> toPack(T value) {
union { BytePack<sizeof(T)> p; T v; };
struct BytePackOf {
static constexpr int Size = sizeof(T);
using Pack = BytePack<Size>;
};
// BytePackOf specialization for the empty pack. sizeof() of an empty type is
// never 0 in C++, so Size is set to 0 explicitly and the pack maps to itself.
template<>
struct BytePackOf<BytePack<0>> {
  typedef BytePack<0> Pack;
  static constexpr int Size = 0;
};
// Reinterprets `value` as the byte pack type chosen by BytePackOf<T>, by
// writing one member of an anonymous union and reading the other.
template<typename T>
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value) {
  using Pack = typename BytePackOf<T>::Pack;
  union {
    Pack p;
    T v;
  };
  v = value;
  return p;
}
template<typename T>
__device__ __forceinline__ T fromPack(BytePack<sizeof(T)> pack) {
union { BytePack<sizeof(T)> p; T v; };
__device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack) {
union { typename BytePackOf<T>::Pack p; T v; };
p = pack;
return v;
}
@@ -152,6 +168,13 @@ template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
//template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
// Size-0 specializations of the load/store templates: the loads return an
// empty BytePack<0> and the store does nothing. `addr` is never dereferenced.
// The shared-memory variants are kept commented out.
template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
//template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
//template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
@@ -255,6 +278,18 @@ __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
// Size-0 store: there is nothing to write, so `addr` and `val` are ignored.
template<>
__device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) {
// nop
}
// 1-byte store. This emits a plain `st.global.b8`, not a `multimem.st`
// instruction; the byte is widened to uint32_t to fit the "r" register operand.
// NOTE(review): presumably multimem.st has no 8-bit form — confirm against the PTX ISA.
template<>
__device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) {
asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory");
}
// 2-byte store. This emits a plain `st.global.b16`, not a `multimem.st`
// instruction; the value is passed through a 16-bit ("h") register operand.
// NOTE(review): presumably multimem.st has no 16-bit form — confirm against the PTX ISA.
template<>
__device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) {
asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory");
}
// 4-byte store: emits `multimem.st.global.b32` with the pack's u32 value. The
// "memory" clobber stops the compiler from reordering memory accesses across it.
template<>
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
}
@@ -37,13 +37,14 @@
* to how that protocol operates with a consistent interface so that our
* algorithm code can operate protocol parametrically.
*/
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, bool NVLS_1 = false>
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, int MultimemSrcs_1 = 0, int MultimemDsts_1 = 0>
struct ProtoSimple {
static constexpr int Id = NCCL_PROTO_SIMPLE;
static constexpr int SlicePerChunk = SlicePerChunk_1;
static constexpr int StepPerSlice = StepPerSlice_1;
static constexpr int Unroll = Unroll_1;
static constexpr bool NVLS = NVLS_1;
static constexpr int MultimemSrcs = MultimemSrcs_1;
static constexpr int MultimemDsts = MultimemDsts_1;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
@@ -55,9 +56,6 @@ struct ProtoSimple {
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
__device__ static int calcGroupWidth(bool send, int nthreads) {
return 1;
}
};
struct ProtoLL {
@@ -73,9 +71,6 @@ struct ProtoLL {
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
__device__ static int calcGroupWidth(bool send, int nthreads) {
return 1;
}
};
struct ProtoLL128 {
@@ -91,9 +86,6 @@ struct ProtoLL128 {
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
__device__ static int calcGroupWidth(bool send, int nthreads) {
return 1;
}
};
/* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template
@@ -133,22 +125,22 @@ class Primitives;
// Used by LL & LL128 to implement direct members in the naive way.
template<typename RealPrimitives>
struct PrimitivesWithoutDirect {
__device__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) {
__device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->send(inpIx, eltN);
}
__device__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
}
__device__ void directRecv(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
}
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
}
__device__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
__device__ void directRecvCopySend(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
}
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}
@@ -566,24 +566,24 @@ private:
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
int connIndex = group >> 16;
auto *channel = &ncclShmem.channel;
barriers = &ncclShmem.groups[this->group].barrier;
barrier_next = ncclShmem.groups[this->group].barrier_next;
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
@@ -32,6 +32,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
const int wid;
const int stepSize;
const int warp;
const int warpInBlock; // warp index in thread block
const bool flagThread;
const int group;
Fan fan;
@@ -488,23 +489,24 @@ private:
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
flagThread((tid%4)==3), group(group&(uint16_t)0xFFFF),
warpInBlock(threadIdx.x/WARP_SIZE),
flagThread((tid%4)==3), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
int connIndex = group >> 16;
auto *channel = &ncclShmem.channel;
barriers = &ncclShmem.groups[this->group].barrier;
barrier_next = ncclShmem.groups[this->group].barrier_next;
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
@@ -13,9 +13,9 @@
#include "msccl/msccl_struct.h"
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, bool NVLS>
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
class Primitives<
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, NVLS>, P2p
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -31,10 +31,9 @@ class Primitives<
DirectWrite = 0x200,
DirectRead = 0x400,
ThreadsSynced = 0x800,
NvlsMinPolling = 0x1000,
NvlsRecv = 0x2000;
NvlsMinPolling = 0x1000;
const int tid, tidInBlock;
int nthreads;
const int nthreads;
int nworkers;
const int stepSize;
Fan fan;
@@ -93,19 +92,19 @@ private:
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (NVLS && (flags & NvlsMinPolling)) {
if (flags & NvlsMinPolling) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
#endif
// volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti
// volatile is faster than acquire but not as correct. Make sure reduceCopy
// loads data using volatile so it doesn't see stale data in L1.
return atomicAdd((unsigned long long *)ptr, 0);
}
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
__device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) {
__device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input
const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
@@ -132,7 +131,7 @@ private:
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
if (flags & DirectWrite) {
ptrs[index] = directBuff + remoteIx + offset;
ptrs[index] = directBuff + dstIx + offset;
} else if (flags & DirectRead) { // empty send
ptrs[index] = nullptr;
} else {
@@ -140,7 +139,7 @@ private:
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & DirectRead) {
ptrs[index] = directBuff + remoteIx + offset;
ptrs[index] = directBuff + srcIx + offset;
} else if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
} else {
@@ -173,7 +172,7 @@ private:
template <int DirectRecv1, int DirectSend1, int Recv, int Send, int SrcBuf, int DstBuf>
__device__ __forceinline__ void genericOp(
intptr_t srcIx, intptr_t dstIx, intptr_t remoteIx, int nelem, bool postOp
intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp
) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
@@ -217,17 +216,12 @@ private:
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
subBarrier();
/* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
* to 0 to avoid unnecessary workload. */
int workSize = ncclShmem.aborted ? 0 : sliceSize;
if (NVLS && ncclShmem.groups[group].nvlsRecv) {
void* src = ncclShmem.groups[group].srcs[0];
void* dst = ncclShmem.groups[group].dsts[0];
copyMultimemMultimem<RedOp>(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize,
cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE)));
} else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
@@ -244,7 +238,7 @@ private:
}
#endif
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, MaxSend, /*PreOpSrcs*/0>
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
(tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
1, ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts+1,
@@ -280,7 +274,7 @@ private:
}
#endif
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs*/0>
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs*/0>
(tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
@@ -316,7 +310,9 @@ private:
constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
@@ -370,19 +366,19 @@ private:
srcs[nsrcs] = dsts[0];
nsrcs++;
if (MULTISRCS){
ReduceOrCopyMulti<Unroll, RedOp, T, 3, MSCCL_MAX_REDUCE_FUSION, 1, 1, 0>
reduceCopy<Unroll, RedOp, T, 0, 3, MSCCL_MAX_REDUCE_FUSION, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, nsrcs, (void **)srcs, 1, (void **)dsts, nelem);
} else {
ReduceOrCopyMulti<Unroll, RedOp, T, 2, 2, 1, 1, 0>
reduceCopy<Unroll, RedOp, T, 0, 2, 2, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 2, (void **)srcs, 1, (void **)dsts, nelem);
}
}
if (COPY){
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)srcs, 1, (void **)dsts, nelem);
if (MULTISRCS) {
for (int i = 1; i < nsrcs; i++){
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)&srcs[i], 1, (void **)&dsts[i], nelem);
}
}
@@ -425,7 +421,7 @@ private:
void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
// Mark for threadfence at the end
fenceNeeded |= true;
}
@@ -437,18 +433,15 @@ private:
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
subBarrier();
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
// Do nothing
} else {
for (int j=0; j<fan.nrecv(); j++) {
int i = (j+shift)%fan.nrecv();
pOffset = i*peerOffset;
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
#pragma unroll 1
for (int j=0; j<fan.nrecv(); j++) {
int i = (j+shift)%fan.nrecv();
pOffset = i*peerOffset;
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
}
}
@@ -469,14 +462,7 @@ private:
}
if (flags & RoleWaitRecv) {
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
if ((index == 0) && (flags & RoleWaitRecv)) {
if (conn->flags & NCCL_NVLS_MIN_POLL) {
flags |= NvlsMinPolling;
ncclShmem.groups[group].nvlsRecv = 1;
} else {
ncclShmem.groups[group].nvlsRecv = 0;
}
}
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
@@ -554,18 +540,16 @@ private:
public:
__forceinline__ __device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
):
tid(tid), tidInBlock(threadIdx.x),
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
this->nthreads = nthreads;
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
this->nworkers = nthreads;
this->group = group & (uint16_t)0xFFFF;
int connIndex = group >> 16;
barriers = &ncclShmem.groups[this->group].barrier;
barrier_next = ncclShmem.groups[this->group].barrier_next;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
@@ -595,8 +579,8 @@ private:
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e);
loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e);
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
@@ -707,62 +691,62 @@ private:
}
__device__ __forceinline__ void send(intptr_t inpIx, int eltN) {
genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, -1, eltN, false);
genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, -1, eltN, false);
genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false);
}
__device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) {
genericOp<0, 1, 0, 1, Input, -1>(inpIx, -1, remoteOutIx, eltN, false);
__device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false);
}
__device__ __forceinline__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
genericOp<0, 1, 0, 1, Output, -1>(outIx, -1, remoteOutIx, eltN, false);
__device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false);
}
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, /*postOp=*/false);
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
}
__device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp);
genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
__device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp);
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp);
}
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, false);
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
}
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, postOp);
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, -1, eltN, postOp);
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, intptr_t remoteInpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, remoteInpIx, eltN, postOp);
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp);
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void
@@ -31,7 +31,7 @@ namespace {
const int root = args->root;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
int realChunkSize;
@@ -56,9 +56,14 @@ struct Apply_PostOp/*{
static BytePack<EltPerPack*sizeof(T)> postOp(Fn fn, BytePack<EltPerPack*sizeof(T)> a);
}*/;
template<typename Fn>
struct LoadMultimem_BigPackSize/*{
// If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem,
// otherwise there are no valid pack sizes for LoadMultimem.
static constexpr int BigPackSize = 0;
}*/;
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem/*{
static constexpr int PackSize; // 0 if not implemented
static BytePack<PackSize> load(Fn fn, uintptr_t addr);
static BytePack<BytePerPack> load(Fn fn, uintptr_t addr);
}*/;
////////////////////////////////////////////////////////////////////////////////
@@ -70,7 +75,7 @@ struct Apply_LoadMultimem/*{
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
return fromPack<Pack>(
Apply_Reduce<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
Apply_Reduce<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::reduce(fn, toPack(a), toPack(b))
);
}
@@ -78,7 +83,7 @@ __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PreOp<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
Apply_PreOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::preOp(fn, toPack(a))
);
}
@@ -86,19 +91,27 @@ __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PostOp<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
Apply_PostOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::postOp(fn, toPack(a))
);
}
template<typename Fn>
__device__ __forceinline__ BytePack<Apply_LoadMultimem<Fn>::PackSize> applyLoadMultimem(Fn fn, uintptr_t addr) {
return Apply_LoadMultimem<Fn>::load(fn, addr);
template<typename Fn, int BytePerPack>
__device__ __forceinline__ BytePack<BytePerPack> applyLoadMultimem(Fn fn, uintptr_t addr) {
return Apply_LoadMultimem<Fn, BytePerPack>::load(fn, addr);
}
////////////////////////////////////////////////////////////////////////////////
// Apply_Reduce
// Degenerate specialization for a pack holding zero elements. There is nothing
// to reduce: both inputs are ignored and an empty BytePack<0> is returned.
template<typename Fn>
struct Apply_Reduce<Fn, /*EltPerPack=*/0> {
__device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
return {};
}
};
// General recursive definition (EltPerPack > 1). This is how we iterate over
// all elements in a pack of any size, by breaking it into halves. Eventually
// we'll hit a base case (a more specific template specialization which takes
@@ -283,6 +296,14 @@ struct Apply_PreOp<Fn, /*EltPerPack=*/1> {
return a;
}
};
// Degenerate specialization (EltPerPack == 0): a pack with no elements has
// nothing to pre-process. It is flagged as an identity operation and returns
// an empty BytePack<0>, ignoring its input.
template<typename Fn>
struct Apply_PreOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
// Apply_PostOp
@@ -316,6 +337,14 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/1> {
return a;
}
};
// Degenerate specialization (EltPerPack == 0): a pack with no elements has
// nothing to post-process. It is flagged as an identity operation and returns
// an empty BytePack<0>, ignoring its input.
template<typename Fn>
struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
@@ -506,11 +535,6 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
////////////////////////////////////////////////////////////////////////////////
// Apply_LoadMultimem
template<typename Fn>
struct Apply_LoadMultimem {
static constexpr int PackSize = 0; // Indicates not implemented
};
#define SIZEOF_BytePack_field_u16 2
#define PTX_REG_BytePack_field_u16 "h"
@@ -522,11 +546,11 @@ struct Apply_LoadMultimem {
#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>> { \
static constexpr int PackSize = 1*(SIZEOF_BytePack_field_##pack_field); \
struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.global." #op "." #ptx_ty " %0, [%1];" \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
return ans; \
@@ -534,11 +558,11 @@ struct Apply_LoadMultimem {
};
#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>> { \
struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
@@ -547,8 +571,45 @@ struct Apply_LoadMultimem {
return ans; \
} \
};
// Defines two Apply_LoadMultimem specializations for Fn<T>:
//   1. the four-register (v4) load, via DEFINE_Apply_LoadMultimem_v4;
//   2. a single-element load of sizeof(T) bytes, for element types whose PTX
//      reduction type is a packed pair (ptx_ty is an "x2" type).
// The single-element load cannot fetch one element on its own: it loads the
// whole pair containing `addr` into a BytePack<2*sizeof(T)> and returns the
// half selected by (addr/sizeof(T))%2. The load address is therefore rounded
// down to the pair boundary (2*sizeof(T)). Masking with only sizeof(T) leaves
// an element-aligned address unchanged and would issue a misaligned pair load
// for odd-indexed elements.
#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
  DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
  template<> \
  struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { \
    __device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
      BytePack<2*sizeof(T)> tmp; \
      /* Load the aligned pair that contains the requested element. */ \
      asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
        : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
        : "l"(addr & -uintptr_t(2*sizeof(T)))); \
      /* Pick the low or high element of the pair according to addr. */ \
      return tmp.half[(addr/sizeof(T))%2]; \
    } \
  };
// Primary template: selected for any (Fn, BytePerPack) combination that has no
// explicit specialization. It performs no load and returns a value-initialized
// pack; the trap that would flag such a call is currently commented out.
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem {
__device__ static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
//__trap();
return {};
}
};
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
// Selects, per reduction functor Fn, the large pack size in bytes to use with
// multimem loads. A BigPackSize of 0 means multimem.ld_reduce is not supported
// for that functor/element-type combination.
template<typename Fn>
struct LoadMultimem_BigPackSize {
  using T = typename Fn::EltType;
  // Sum-like reductions, matched by exact functor type.
  static constexpr bool IsSum =
    std::is_same<Fn, FuncSum<T>>::value ||
    std::is_same<Fn, FuncPreMulSum<T>>::value ||
    std::is_same<Fn, FuncSumPostDiv<T>>::value;
  // Min/max reductions, matched by exact functor type.
  static constexpr bool IsMinOrMax =
    std::is_same<Fn, FuncMin<T>>::value ||
    std::is_same<Fn, FuncMax<T>>::value;
  static constexpr bool IsFloat = IsFloatingPoint<T>::value;
  // Floating point: sums get 16 bytes below 8-byte elements and 8 bytes
  // otherwise; min/max gets 16 bytes for 2-byte elements only.
  // Integers: sum/min/max on elements of at least 4 bytes use sizeof(T).
  static constexpr int BigPackSize =
    IsFloat
      ? (IsSum ? (sizeof(T) < 8 ? 16 : 8)
               : (IsMinOrMax && sizeof(T)==2 ? 16 : /*multimem.ld_reduce not supported:*/ 0))
      : ((IsSum||IsMinOrMax) && sizeof(T)>=4 ? int(sizeof(T)) : /*multimem.ld_reduce not supported:*/ 0);
};
DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
@@ -565,23 +626,30 @@ struct Apply_LoadMultimem {
DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
DEFINE_Apply_LoadMultimem_v4(FuncSum, half, add, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMin, half, min, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMax, half, max, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
#if defined(__CUDA_BF16_TYPES_EXIST__)
DEFINE_Apply_LoadMultimem_v4(FuncSum, __nv_bfloat16, add, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMin, __nv_bfloat16, min, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4(FuncMax, __nv_bfloat16, max, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
#endif
#else
// Fallback used when the preceding #if condition is false: BigPackSize is 0
// for every Fn, meaning no pack size is valid for multimem loads.
template<typename Fn>
struct LoadMultimem_BigPackSize {
static constexpr int BigPackSize = 0;
};
#endif
#undef DEFINE_Apply_LoadMultimem
#undef DEFINE_Apply_LoadMultimem_v4
#undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf
#undef SIZEOF_BytePack_field_u64
#undef PTX_REG_BytePack_field_u64
#undef SIZEOF_BytePack_field_u32
@@ -30,7 +30,7 @@ namespace {
const ssize_t size = args->count;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -113,19 +113,19 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args);
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce through MC
// Reduce through NVLS
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -15,7 +15,7 @@
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
__device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
@@ -58,9 +58,8 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
#endif
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
@@ -80,7 +79,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -114,7 +113,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
template<typename Proto>
__device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
__device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1;
@@ -142,7 +141,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -189,11 +188,10 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
// warpStarts were rounded thanks to int division, but for group number we need to round the other way around
// So we mirror wid then mirror again the group.
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
args += group;
tid -= args->warpStart * WARP_SIZE;
int nthreads = args->nWarps * WARP_SIZE;
group |= (args->connIndex<<16); // Used to select connIndex 1
if (args->p2pType == ncclWorkP2pTypeUnused) return;
if (tid >= nthreads || args->peer == -1) return;
+2
View File
@@ -74,6 +74,8 @@ void ncclDebugInit() {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "PROXY") == 0) {
mask = NCCL_PROXY;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) {
+73 -55
View File
@@ -167,12 +167,13 @@ static void finishWork(struct ncclWork* work, int WarpSize) {
static void appendWorkElemP2p(
struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
struct ncclWorkElemP2p const *elem
struct ncclWorkElemP2p const *elem, bool fuseOk
) {
constexpr int funcIndex = FUNC_INDEX_P2P;
struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
if (q && funcIndex == q->work.header.funcIndex) {
if (!fuseOk) goto NewWork;
if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) {
for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) {
// Can't have multiple elements of the same ncclWork communicate with the
@@ -301,7 +302,7 @@ NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384);
// ensure *nWorkBudget >= 1 upon entry.
static ncclResult_t addP2pToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget,
bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, uint32_t connIndex
bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, uint32_t connIndex, bool fuseOk
) {
struct ncclInfo info = {
isSendNotRecv ? ncclFuncSend : ncclFuncRecv,
@@ -316,7 +317,7 @@ static ncclResult_t addP2pToPlan(
// 1 is connIndex
struct ncclConnInfo* conn = isSendNotRecv ?
&comm->channels[channelId].peers[peer].send[1].conn : &comm->channels[channelId].peers[peer].recv[1].conn;
&comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn;
info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE;
struct ncclProxyOp proxyOp = {};
@@ -337,7 +338,7 @@ static ncclResult_t addP2pToPlan(
elem.connIndex = connIndex;
*nWorkBudget += plan->channels[channelId].nWork;
appendWorkElemP2p(comm, plan, channelId, &elem);
appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk);
*nWorkBudget -= plan->channels[channelId].nWork;
// Calculate the opCount after appendWorkElemP2p since it will always return
@@ -508,7 +509,7 @@ static ncclResult_t scheduleCollTasksToPlan(
info.sliceSteps = head->sliceSteps;
NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks));
if (nAggOps > 1) {
int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]);
info.nChannels = std::max(1, std::min(info.nChannels, maxChannels));
info.algorithm = aggInfo.algorithm;
@@ -533,7 +534,7 @@ static ncclResult_t scheduleCollTasksToPlan(
NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
}
int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
// Channel cap for this task: both NVLS variants are limited to comm->nvlsChannels, every other
// algorithm to comm->nChannels. Test info.algorithm for both variants: aggInfo.algorithm is only
// copied into info inside the nAggOps > 1 branch above, so it is not the per-task source of truth here.
int maxChannels = info.algorithm == NCCL_ALGO_NVLS || info.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
tasks->nTasksColl -= 1;
@@ -584,17 +585,22 @@ static ncclResult_t scheduleP2pTasksToPlan(
// Try to use all channels
int nChannelsMax = comm->p2pnChannelsPerPeer;
int nChannelsMin = nChannelsMax;
// Try to use all channels, but one channel per operation.
while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
if (comm->nNodes == 1) {
// Try to use all channels, but one channel per operation.
while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
}
bool fuseOk;
// We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries.
while (tasks->nTasksP2p != 0) {
for (int i=0; i < nRanks; i++) {
for (int i=0; i < tasks->p2pOrderSteps; i++) {
int sendPeer = sendOrder[i];
int recvPeer = recvOrder[i];
struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendPeer].sendQueue);
struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvPeer].recvQueue);
if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false;
struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL;
struct ncclTaskP2p* recv = recvPeer != -1 ? ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL;
if (sendPeer == comm->rank) {
if (recvPeer != comm->rank) {
WARN("Sendrecv plan not aligned for self");
@@ -639,7 +645,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
if (recvChunkBytes != 0) {
if (recvChunkBytes == -1) recvChunkBytes = 0;
if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, recvIdx));
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, recvIdx, fuseOk));
fuseOk = true;
recvPtr += recvChunkBytes;
recvBytes -= recvChunkBytes;
recv->chunk += 1;
@@ -652,7 +659,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
if (sendChunkBytes != 0) {
if (sendChunkBytes == -1) sendChunkBytes = 0;
if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, sendIdx));
NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, sendIdx, fuseOk));
fuseOk = true;
sendPtr += sendChunkBytes;
sendBytes -= sendChunkBytes;
send->chunk += 1;
@@ -785,12 +793,12 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
}
static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) {
uint64_t collOpCount = comm->collOpCount;
uint64_t collOpCount = comm->sharedRes->collOpCount;
// Advance comm's collOpCount by number of colls in this plan.
comm->collOpCount = collOpCount + plan->collOpCount;
comm->sharedRes->collOpCount += plan->collOpCount;
for (int c=0; c < plan->channelUbound; c++) {
struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue);
uint64_t p2pOpCount = comm->channels[c].p2pOpCount;
uint64_t p2pOpCount = comm->sharedRes->p2pOpCount[c];
uint64_t nextP2pOpCount = p2pOpCount;
while (q != nullptr) {
struct ncclProxyOp* qNext = q->enqNext;
@@ -813,7 +821,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
q = qNext;
}
// Advance channel's p2pOpCount by number of p2p's in this plan channel.
comm->channels[c].p2pOpCount = nextP2pOpCount;
comm->sharedRes->p2pOpCount[c] = nextP2pOpCount;
}
return ncclSuccess;
}
@@ -932,15 +940,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
// The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires
// at least one of the two streams to be strong-stream.
cudaStream_t launchStream = tasks->streams->stream;
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->deviceStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure);
if (tasks->numStreams != 1) {
// Create dependency for device stream on user streams. First from extra user
// streams to deviceStream. Then deviceStream to first user stream.
for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) {
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, l->stream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure);
}
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
} else if (tasks->streams->stream != comm->lastStream && comm->lastStream != nullptr) {
// Stream changed from last call, create dependency against last NCCL kernel launch
CUDACHECK(hipStreamWaitEvent(tasks->streams->stream, comm->doneEvent, 0));
@@ -954,15 +962,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
if (plan->hasProxyOps) {
if (!acquired) {
acquired = true;
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->hostStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
}
NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->hostStream, hostStreamPlanCallback, plan), result, failure);
NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure);
}
}
if (acquired) {
// Make to-be-launched kernels dependent on just-launched host stream tasks.
if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->hostStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->hostStream), result, failure);
if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure);
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
}
}
@@ -1011,7 +1019,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
if (driverVersion >= 11080) {
int compCap = comm->compCap;
unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0;
unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0;
cudaLaunchConfig_t launchConfig = {0};
cudaLaunchAttribute launchAttrs[3];
@@ -1083,7 +1091,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
// Create dependency for deviceStream on launchStream. We know that deviceStream
// hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
// so we can say that launchStream subsumes it.
if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
resume1:
// Create dependency for other user streams (skip launch stream) on deviceStream.
// Again, the user streams haven't been touched since deviceStream waited on them
@@ -1091,13 +1099,13 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
struct ncclCudaStreamList* sl = tasks->streams->next;
tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
while (sl != nullptr && tasks->numStreams != 1) {
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream, /*b_subsumes_a=*/true), result, resume2);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2);
resume2:
sl = sl->next;
}
tasks->numStreams = 0;
// Release device stream as acquired in ncclLaunchPrepare()
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume3);
NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3);
resume3:;
}
return result;
@@ -1108,13 +1116,9 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
/*****************************************************************************/
// Stores in *collNetTypeSupport the CollNet support value for info's reduction op and datatype.
// ncclAvg and any op >= ncclNumOps are looked up as ncclSum.
// Two forms of the lookup appear in this listing:
//  - a collNetReduceSupport() query, made only when comm->collNetSupport > 0 (0 is stored otherwise);
//  - a direct read of comm->collNetSupportMatrix[netOp][datatype].
// NOTE(review): the matrix is presumably filled in at communicator init — confirm against init code.
static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
if (info->comm->collNetSupport > 0) {
// Translate ncclAvg and PreMulSum
ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
NCCLCHECK(collNetReduceSupport(info->comm, info->datatype, netOp, collNetTypeSupport));
} else {
*collNetTypeSupport = 0;
}
// Translate ncclAvg and PreMulSum
ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
*collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype];
return ncclSuccess;
}
@@ -1134,6 +1138,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
for (int a=0; a<nAlgos; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue;
if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time;
@@ -1167,7 +1173,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
}
ncSwitch /= 2;
}
} else if (info->algorithm == NCCL_ALGO_NVLS) {
} else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
// NVLS should not need more than 16 channels to get peak BW.
nc = comm->nvlsChannels;
} else {
@@ -1185,12 +1191,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
if (info->protocol == NCCL_PROTO_SIMPLE) {
nt += WARP_SIZE; // Extra warp for sync
if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
// More threads or sync warps needed due to split thread model
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS;
if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
}
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
#endif
@@ -1234,11 +1237,15 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
case ncclFuncReduceScatter:
case ncclFuncAllGather:
info->pattern =
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
ncclPatternRing; break;
case ncclFuncAllToAllPivot:
info->pattern = ncclPatternRing; break;
case ncclFuncAllReduce:
info->pattern =
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
@@ -1258,14 +1265,17 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternCollnetChain:
info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
case ncclPatternNvls:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break;
case ncclPatternCollnetDirect:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternRingTwice:
info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternNvlsTree:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break;
default:
WARN("Unknown pattern %d", info->pattern);
return ncclInternalError;
@@ -1348,13 +1358,22 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
if (chunkSize > 131072) chunkSize = 131072;
int maxChunkSize = 131072;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (64 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (1 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
@@ -1383,8 +1402,7 @@ comp_next:
proxyOp->chunkSize = chunkSize;
proxyOp->protocol = info->protocol;
proxyOp->dtype = info->datatype;
proxyOp->redOp = (info->algorithm != NCCL_ALGO_COLLNET_DIRECT && info->algorithm != NCCL_ALGO_COLLNET_CHAIN) ? ncclNumOps : // Only set redOp when using CollNet
info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
info->op;
proxyOp->pattern = info->pattern;
proxyOp->root = info->root;
@@ -1502,20 +1520,20 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
int channelId;
NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
if (isSendNotRecv) {
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1UL<<channelId);
ncclGroupCommPreconnect(comm);
}
if (comm->p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
ncclGroupCommPreconnect(comm);
}
} else {
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1UL<<channelId);
ncclGroupCommPreconnect(comm);
}
if (comm->p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
ncclGroupCommPreconnect(comm);
}
@@ -1550,7 +1568,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
t->chunkSteps = info->chunkSteps;
t->sliceSteps = info->sliceSteps;
ncclIntruQueueEnqueue(&tasks->collQueue, t);
tasks->collBytesTotal += t->count*ncclTypeSize(t->datatype);
tasks->collBytesTotal += info->nBytes;
tasks->nTasksColl += 1;
}
}
@@ -1611,10 +1629,10 @@ exit:
NCCLCHECK(ncclGroupEndInternal());
/* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
* so we have to check state here. */
if (info->comm && !info->comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
return ret;
fail:
if (info->comm && !info->comm->blocking) (void) ncclCommSetAsyncError(info->comm, ret);
if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
goto exit;
}
+194 -99
View File
@@ -15,15 +15,10 @@
/********************* Internode connection ***********************/
/******************************************************************/
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks) {
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
int localRanks = 0;
for (int i=0; i<comm->topo->nodes[GPU].count; i++) {
localRanks += comm->topo->nodes[GPU].nodes[i].gpu.nRanksPerGpu;
}
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
@@ -39,9 +34,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
int* collNetIntra = collNetGraph->intra+c*localRanks;
int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks;
for (int i=0; i<localRanks; i++) {
if (ringIntra[i] == rank) {
@@ -52,8 +48,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
}
if (treeIntra[i] == rank) {
int parentIndex = 0;
int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
@@ -68,6 +64,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
topoRanks->nvlsHeads[c] = nvlsIntra[0];
}
// Duplicate channels rings/trees
struct ncclChannel* channel0 = comm->channels;
@@ -79,10 +76,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph) {
int nChannels = comm->nChannels;
int localRanks = 0;
for (int i=0; i<comm->topo->nodes[GPU].count; i++) {
localRanks += comm->topo->nodes[GPU].nodes[i].gpu.nRanksPerGpu;
}
int localRanks = comm->topo->nodes[GPU].count;
//new tree
for (int c=0; c<nChannels; c++) {
int* treeIntra = treeGraph->intra+c%3*localRanks;
@@ -120,26 +114,26 @@ ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
return ncclSuccess;
}
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
int nChannels = comm->nChannels;
int nNodes = comm->nNodes;
for (int c=0; c<nChannels; c++) {
int* recv = ringRecv+c*comm->nRanks;
int* send = ringSend+c*comm->nRanks;
int* recv = ringRecv+c*comm->nNodes;
int* send = ringSend+c*comm->nNodes;
int* prev = ringPrev+c*comm->nRanks;
int* next = ringNext+c*comm->nRanks;
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels;
for (int n=0; n<nNodes; n++) {
int recvRank = recv[firstRanks[n]];
int prevSendRank = send[firstRanks[(n-1+nNodes)%nNodes]];
int recvRank = recv[n];
int prevSendRank = send[(n-1+nNodes)%nNodes];
prev[recvRank] = prevSendRank;
if (comm->rank == recvRank) {
channel0->ring.prev = prevSendRank;
if (channel1) channel1->ring.prev = prevSendRank;
}
int sendRank = send[firstRanks[n]];
int nextRecvRank = recv[firstRanks[(n+1)%nNodes]];
int sendRank = send[n];
int nextRecvRank = recv[(n+1)%nNodes];
next[sendRank] = nextRecvRank;
if (comm->rank == sendRank) {
channel0->ring.next = nextRecvRank;
@@ -152,8 +146,8 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
return ncclSuccess;
}
// Fills indexes[0..nNodes-1] with one entry per node taken from ranks, then returns ncclSuccess.
// Two variants appear in this listing:
//  - with firstRanks: entry n is ranks[firstRanks[n]] (ranks is indexed indirectly through firstRanks);
//  - without firstRanks: entry n is ranks[n] (ranks is indexed by n directly).
// NOTE(review): the second form presumably expects ranks to hold nNodes entries — confirm against callers.
static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) {
for (int n=0; n<nNodes; n++) indexes[n] = ranks[firstRanks[n]];
static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
return ncclSuccess;
}
@@ -175,42 +169,38 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
return ncclSuccess;
}
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int nChannels = (comm->nChannels > MAXCHANNELS/2) ? comm->nChannels/2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node;
int* ranksToParent, *ranksToChild0, *ranksToChild1;
NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
int* ttp, *ttc0, *ttc1;
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
if (comm->nChannels <= MAXCHANNELS/2) {
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
if (comm->rank == ranksToParent[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
ttp = treeToParent+c*comm->nNodes;
ttc0 = treeToChild0+c*comm->nNodes;
ttc1 = treeToChild1+c*comm->nNodes;
if (comm->rank == ttp[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if (comm->rank == ranksToChild0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
if (comm->rank == ttc0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if (comm->rank == ranksToChild1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
if (comm->rank == ttc1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if (comm->rank == ranksToParent[node] ||
comm->rank == ranksToChild0[node] ||
comm->rank == ranksToChild1[node]) {
if (comm->rank == ttp[node] ||
comm->rank == ttc0[node] ||
comm->rank == ttc1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
@@ -219,64 +209,63 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int*
} else {
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
if (comm->rank == ranksToParent[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
ttp = treeToParent+c*comm->nNodes;
ttc0 = treeToChild0+c*comm->nNodes;
ttc1 = treeToChild1+c*comm->nNodes;
if (comm->rank == ttp[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
}
if (comm->rank == ranksToChild0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
if (comm->rank == ttc0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
}
if (comm->rank == ranksToChild1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
if (comm->rank == ttc1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
}
if (comm->rank == ranksToParent[node] ||
comm->rank == ranksToChild0[node] ||
comm->rank == ranksToChild1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
if (comm->rank == ttp[node] ||
comm->rank == ttc0[node] ||
comm->rank == ttc1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
}
channel0->tree.depth = depth;
}
for (int c=nChannels; c<nChannels*2; c++) {
struct ncclChannel* channel1 = comm->channels+c;
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
if (comm->rank == ranksToParent[node]) {
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
ttp = treeToParent+c*comm->nNodes;
ttc0 = treeToChild0+c*comm->nNodes;
ttc1 = treeToChild1+c*comm->nNodes;
if (comm->rank == ttp[node]) {
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if (comm->rank == ranksToChild0[node]) {
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
if (comm->rank == ttc0[node]) {
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if (comm->rank == ranksToChild1[node]) {
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
if (comm->rank == ttc1[node]) {
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if (comm->rank == ranksToParent[node] ||
comm->rank == ranksToChild0[node] ||
comm->rank == ranksToChild1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
if (comm->rank == ttp[node] ||
comm->rank == ttc0[node] ||
comm->rank == ttc1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
channel1->tree.depth = depth;
}
}
free(ranksToParent);
free(ranksToChild0);
free(ranksToChild1);
return ncclSuccess;
}
static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nHeads = collNetGraph->nChannels;
int nHeads = 0;
int *heads;
NCCLCHECK(ncclCalloc(&heads, nHeads));
NCCLCHECK(ncclCalloc(&heads, localRanks));
// Find all head ranks
// Head index is always 0
for (int c=0; c<nHeads; c++) {
for (int c=0; c<collNetGraph->nChannels; c++) {
int* collNetIntra = collNetGraph->intra+c*localRanks;
heads[c] = collNetIntra[0];
int head = collNetIntra[0];
for (int h=0; h<nHeads; h++) if (heads[h] == head) head = -1;
if (head != -1) heads[nHeads++] = collNetIntra[0];
}
// For all channels
for (int c=0; c<comm->nChannels; c++) {
@@ -315,10 +304,96 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
INFO(NCCL_GRAPH, "%s", line);
channel->collnetChain.depth = comm->nRanks/comm->nNodes;
}
for (int c=0; c<comm->nvlsChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
}
free(heads);
return ncclSuccess;
}
// Fill in the NVLS connectivity of every NVLS channel of this communicator.
//
//   comm      - communicator whose channels[0 .. nvlsChannels) are updated.
//   nvlsHeads - table read as nvlsHeads[h*comm->nNodes + n], i.e. one row of
//               nNodes entries per head h.
//   nvlsGraph - NVLS graph; nChannels is used as the number of heads and
//               intra[h*comm->localRanks] as the local rank leading head h.
//
// When the graph has no channel, comm->nvlsChannels is reset to 0 and nothing
// else is touched. With a single node only the per-channel up/down fields are
// written; with several nodes treeUp/treeDown are additionally derived from
// the double tree computed by ncclGetDtree.
// Returns ncclSuccess unless the NCCLCHECK around ncclGetDtree bails out.
static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) {
  const int nHeads = nvlsGraph->nChannels;
  if (nHeads == 0) {
    comm->nvlsChannels = 0;
    return ncclSuccess;
  }

  // Index of the head led by this rank, -1 if it leads none (last match wins).
  int headRank = -1;
  for (int h=0; h<nHeads; h++) {
    if (nvlsGraph->intra[h*comm->localRanks] == comm->rank) headRank = h;
  }

  // Intra-node part: identical for every NVLS channel.
  for (int c=0; c<comm->nvlsChannels; c++) {
    struct ncclNvls* nvls = &comm->channels[c].nvls;
    nvls->nHeads = nHeads;
    // Heads are addressed as peers numbered after the regular ranks.
    for (int h=0; h<nHeads; h++) nvls->up[h] = comm->nRanks+1+h;
    for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) nvls->up[h] = -1;
    nvls->down = comm->nRanks+1+headRank;
    nvls->out = -1; // NVLS+SHARP not yet implemented.
    nvls->headRank = headRank;
    nvls->treeUp = -1;
    for (int d=0; d<3; d++) nvls->treeDown[d] = -1;
    nvls->node = comm->node;
    nvls->nNodes = comm->nNodes;
  }

  // Nothing to connect across nodes.
  if (comm->nNodes == 1) return ncclSuccess;

  // Inter-node part: position of this node in the two trees.
  int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
  int childType0, childType1; // outputs of ncclGetDtree that are not used here
  NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
        &tree0Parent, &tree0Child0, &tree0Child1, &childType0,
        &tree1Parent, &tree1Child0, &tree1Child1, &childType1));

  // Only node 0 prints the head table, capped at the first 20 nodes per head.
  if (comm->node == 0) {
    for (int h=0; h<nHeads; h++) {
      const int* row = nvlsHeads+h*comm->nNodes;
      char line[1024];
      int off = snprintf(line, sizeof(line), "NVLS Head %2d:", h);
      for (int n=0; n<comm->nNodes && n<20; n++) {
        off += snprintf(line+off, sizeof(line)-off, " %2d", row[n]);
      }
      INFO(NCCL_INIT, "%s", line);
    }
  }

  // Use the first head row in which this rank is the entry for this node to
  // translate tree node indices into ranks; everything stays -1 otherwise.
  int treeUp[2] = { -1, -1 };
  int treeDown0[2] = { -1, -1 };
  int treeDown1[2] = { -1, -1 };
  for (int h=0; h<nHeads; h++) {
    const int* row = nvlsHeads+h*comm->nNodes;
    if (row[comm->node] != comm->rank) continue;
    treeUp[0]    = tree0Parent == -1 ? -1 : row[tree0Parent];
    treeDown0[0] = tree0Child0 == -1 ? -1 : row[tree0Child0];
    treeDown1[0] = tree0Child1 == -1 ? -1 : row[tree0Child1];
    treeUp[1]    = tree1Parent == -1 ? -1 : row[tree1Parent];
    treeDown0[1] = tree1Child0 == -1 ? -1 : row[tree1Child0];
    treeDown1[1] = tree1Child1 == -1 ? -1 : row[tree1Child1];
    break;
  }

  // Set prev/next in all channels (NVLS compute channels work
  // orthogonally to NVLS search channels). Even channels take tree 0,
  // odd channels take tree 1.
  for (int c=0; c<comm->nvlsChannels; c++) {
    struct ncclNvls* nvls = &comm->channels[c].nvls;
    const int t = c%2;
    nvls->treeUp = treeUp[t];
    // Slot 0 is always the local down peer; remote children are packed after it.
    nvls->treeDown[0] = nvls->down;
    int ix = 1;
    if (treeDown0[t] != -1) nvls->treeDown[ix++] = treeDown0[t];
    if (treeDown1[t] != -1) nvls->treeDown[ix] = treeDown1[t];
  }

  struct ncclNvls* first = &comm->channels[0].nvls;
  struct ncclNvls* second = &comm->channels[1].nvls;
  INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
      first->treeDown[0], first->treeDown[1], comm->rank, first->treeUp,
      second->treeDown[0], second->treeDown[1], comm->rank, second->treeUp);
  return ncclSuccess;
}
// Legacy naming
NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
@@ -360,33 +435,40 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev
return c;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc) {
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
int nranks = comm->nRanks;
int nNodes = comm->nNodes;
int nChannels = comm->nChannels;
NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
for (int i=0; i<nranks; i++) {
for (int c=0; c<nChannels;c++) {
ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
for (int c=0; c<nChannels;c++) {
for (int n=0; n<nNodes; n++) {
int r = firstRanks[n];
ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
}
for (int r=0; r<nranks; r++) {
ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS]));
// Duplicate ringPrev/ringNext for ncclBuildRing
if (nChannels <= MAXCHANNELS/2) memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -400,6 +482,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Setup CollNet
if (comm->collNetSupport == 1) {
struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT];
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
@@ -408,10 +491,21 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
NCCLCHECK(connectCollNet(comm, collNetGraph));
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= MAXCHANNELS) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(nc, ncclMinNchannels()), ringPrev, ringNext);
if (comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
} else {
nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
}
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
@@ -423,6 +517,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
free(nvlsHeads);
return ncclSuccess;
}
+35 -49
View File
@@ -273,7 +273,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
if (intermediateNode->type == GPU) {
intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank[0];
if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
}
}
@@ -409,7 +409,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
if (distance == PATH_PXN) {
// In case of PXN, use the intermediate GPU distance instead
int proxyRank, g;
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank[0], netDev, &proxyRank));
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
distance = proxyGpu->paths[NET][n].type;
@@ -489,7 +489,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return ncclInternalError;
}
*intermediateRank = node->gpu.rank[0];
*intermediateRank = node->gpu.rank;
} else {
*intermediateRank = rank;
}
@@ -563,6 +563,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system));
}
// Set direct paths to NVSwitches.
for (int n=0; n<system->nodes[NVS].count; n++) {
NCCLCHECK(ncclTopoSetPaths(system->nodes[NVS].nodes+n, system));
}
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int p=0; p<system->nodes[GPU].count; p++) {
@@ -578,10 +583,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
if (comm == NULL) continue;
// Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank[0];
struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank;
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank[0];
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if (p2p == 0) {
@@ -589,7 +594,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
if (shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
}
}
}
@@ -603,32 +608,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
if (ncclPxnDisable(comm) != 1) {
int pxnGpu = -1;
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
int localGpuIndex;
NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
if (localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if (peerNode->paths[NET][n].type > PATH_PXB || // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type > PATH_NVL || // Is connected to us through NVLink
(peerNode->paths[NET][n].bw <= gpu->paths[NET][n].bw && // Has either higher BW to that NIC
gpu->paths[NET][n].type <= PATH_PXB)) // or avoids going through a CPU
continue;
pxnGpu = p;
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank[0], &netDev));
// To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
if (netDev == netNode->id) break;
}
if (pxnGpu != -1) {
if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
// Update path when we don't want to / can't use GPU Direct RDMA.
@@ -659,16 +652,11 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
domains[g] = g;
ids[g] = gpu->id;
for (int p=0; p<g; p++) {
if (gpu->paths[GPU][p].count > 0) {
if (gpu->paths[GPU][p].type < PATH_NET) {
domains[g] = std::min(domains[g], domains[p]);
}
}
for (int j=0; j<gpu->gpu.nRanksPerGpu; j++ ) {
if (gpu->gpu.rank[j] == comm->rank) {
myDomain = domains[g];
break;
}
}
if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
@@ -732,7 +720,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
for (int g = 0; g < system->nodes[GPU].count; g++) {
int net;
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if (!gdr) break;
}
@@ -742,16 +730,12 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
}
}
if (rcclParamEnableIntranet()) {
remove = 0;
system->type |= RCCL_TOPO_FORCE_INTRA;
}
comm->localRanks = 0;
for (int n=0; n<system->nodes[GPU].count; n++ ) {
comm->localRanks += system->nodes[GPU].nodes[n].gpu.nRanksPerGpu;
}
if (comm->localRanks == comm->nRanks && remove) {
comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks && remove) {
for (int n=system->nodes[NET].count-1; n>=0; n--)
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
}
@@ -808,8 +792,14 @@ static int nextPow2(int v) {
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
if (comm->sharedRes->owner != comm) {
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()), comm->sharedRes->tpP2pNChannels);
} else {
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
}
int minChannels = comm->p2pnChannels;
// We need to loop through all local GPUs to have a global picture
for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
@@ -857,14 +847,10 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
int nvbGpus = 0;
for (int g=0; g<ngpus; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
int j=0;
for ( ; j<gpu->gpu.nRanksPerGpu; j++ ){
if (gpu->gpu.rank[j] == rank) break;
}
if ( j == gpu->gpu.nRanksPerGpu ) continue;
if (gpu->gpu.rank != rank) continue;
for (int p=0; p<ngpus; p++) {
if (gpu->paths[GPU][p].type == PATH_NVB) {
(*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank[j];
(*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
}
}
}
+6 -6
View File
@@ -691,7 +691,7 @@ ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct n
if (g == system->nodes[GPU].nodes[j].gpu.dev)
break;
if (j < ngpus)
graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank[0];
graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
else
return ncclInternalError;
}
@@ -725,7 +725,7 @@ end:
if (graph->id == 1) {
for (int i=0; i<graph->nChannels; i++) {
int net;
ncclTopoGetLocalNet(system, graph->intra[i*ngpus+1], &net);
ncclTopoGetLocalNet(system, graph->intra[i*ngpus+1], i, &net);
graph->inter[i*2+1] = net;
}
}
@@ -788,7 +788,7 @@ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, str
break;
if (j < ngpus)
{
graph->treeBase[r][x] = system->nodes[GPU].nodes[j].gpu.rank[0];
graph->treeBase[r][x] = system->nodes[GPU].nodes[j].gpu.rank;
y=r;
}
else
@@ -926,15 +926,15 @@ ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGrap
// find the first unused GPU that is closest to NIC
int f, m;
for (f = 0; f < ngpus; f++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank[0]) break;
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
if(j >= n) break;
}
for (int i = 0; i < ngpus; i++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank[0]) break;
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
if (j < n) continue;
if (paths[i].count < paths[f].count) f = i;
}
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank[0]) break;
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
used[n] = graph->intra[n*ngpus+m];
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
+221 -174
View File
@@ -13,6 +13,8 @@
#include <sys/time.h>
#include "rome_models.h"
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
// Initialize system->maxBw. This is the per-channel (i.e. per-SM)
// max bw.
static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
@@ -109,15 +111,26 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
if (type1 == -1) return ncclSuccess;
struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1;
struct ncclTopoLinkList* path = node1->paths[type2]+index2;
struct ncclTopoNode* node2 = system->nodes[type2].nodes+index2;
struct ncclTopoLinkList* revPath = node2->paths[type1]+index1;
if (path == NULL) {
WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
return ncclInternalError;
}
if (path->count == 0 ) return ncclSuccess;
// Now check link type
*node = NULL;
int intra = type1 == GPU && type2 == GPU;
int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
float bw = intra ? graph->bwIntra : graph->bwInter;
int type = intra ? graph->typeIntra : graph->typeInter;
if (mult == 1 && (path->type > type)) return ncclSuccess;
if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) &&
(revPath->type > type)) return ncclSuccess;
bw *= mult;
@@ -186,11 +199,9 @@ static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* index) {
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++) {
if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
*index = g;
return ncclSuccess;
}
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
*index = g;
return ncclSuccess;
}
}
WARN("Could not find gpu rank %d", rank);
@@ -259,7 +270,7 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
// Try to keep all searches within one second
#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18)
#define NCCL_SEARCH_GLOBAL_TIMEOUT (5ULL<<16)
#define NCCL_SEARCH_TIMEOUT (1<<14)
#define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
@@ -272,13 +283,9 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
if (graph->nChannels == 0) return ncclInternalError;
int ngpus = system->nodes[GPU].count;
int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
for (int i=0; i<ngpus; i++) {
for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
if (system->nodes[GPU].nodes[i].gpu.rank[j] == nextRank) {
*g = i;
return ncclSuccess;
}
}
for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
*g = i;
return ncclSuccess;
}
if (*g == -1) return ncclInternalError;
return ncclSuccess;
@@ -308,26 +315,18 @@ static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph
int n = graph->intra[ngpus*c+((i+1)%ngpus)];
struct ncclTopoNode *node;
int j;
for (j=0; j<ngpus; j++) {
bool found=false;
for (int k=0; k<system->nodes[GPU].nodes[j].gpu.nRanksPerGpu; k++) {
if (system->nodes[GPU].nodes[j].gpu.rank[k] == g)
found = true;
}
if (found) break;
}
for (j=0; j<ngpus; j++)
if (system->nodes[GPU].nodes[j].gpu.rank == g) break;
if (j<ngpus) {
node = system->nodes[GPU].nodes+j;
for (int k = 0; k<system->nodes[GPU].count; k++) {
if (node->paths[GPU][k].count == 1) {
struct ncclTopoLink* link = node->paths[GPU][k].list[0];
struct ncclTopoNode* remNode = link->remNode;
for (int l=0; l<remNode->gpu.nRanksPerGpu; l++) {
if (remNode->gpu.rank[l] == n) {
if (link->type == LINK_NVL)
count ++;
}
}
if (remNode->gpu.rank == n) {
if (link->type == LINK_NVL)
count ++;
}
}
}
}
@@ -336,17 +335,57 @@ static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph
return count;
}
// Try to continue the NVLS search with GPU index `g` as the selected GPU.
//
// Follows the path NVS 0 -> GPU d for each GPU index d in turn, then the path
// GPU d -> NVS 0 for each GPU index d, passing ncclTopoFollowPath a multiplier
// of 2 for GPU `g` and 1 for every other GPU. Each sweep stops at the first
// step for which ncclTopoFollowPath hands back a NULL node. If both sweeps
// complete, the search goes on through ncclTopoSearchRecGpu.
//
// Before returning, every step that succeeded is followed again with the
// negated multiplier, in reverse order (GPU->NVS steps first, then NVS->GPU).
// NOTE(review): presumably the negative multiplier gives back the bandwidth
// taken by the matching positive call -- confirm against ncclTopoFollowPath.
//
// Returns ncclSuccess when it reaches the end. NCCLCHECK presumably returns
// early on error, in which case the steps taken so far are not undone.
ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
struct ncclTopoNode* nvs;
struct ncclTopoNode* gpu;
// d0 counts the NVS->GPU steps attempted so far.
int d0=0; // See if there is enough bandwidth for NVS->GPU traffic
do {
NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu));
d0++;
} while (gpu && d0 < system->nodes[GPU].count);
if (gpu == NULL) {
// The last step did not go through: leave it out of the undo loop below.
d0--;
} else {
// d1 counts the GPU->NVS steps attempted so far.
int d1=0; // See if there is enough bandwidth for GPU->NVS traffic
do {
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs));
d1++;
} while (nvs && d1 < system->nodes[GPU].count);
if (nvs == NULL) {
// Same as above: the failed step is not undone.
d1--;
} else { // Both directions worked. Move on to the next path.
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
}
// Undo the GPU->NVS steps, last one first.
while (d1) {
d1--;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs));
}
}
// Undo the NVS->GPU steps, last one first.
while (d0) {
d0--;
NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu));
}
return ncclSuccess;
}
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
// 1. Constraint to get the same nChannels between Rings and Trees
// 1. Try to get the same nChannels between Rings and Trees
if (graph->nChannels < graph->minChannels) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
return ncclSuccess;
}
// 2. Try to get better bandwidth
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
// Give a 15% perf bonus to paths not crossing nics
float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) {
*copy = 1;
return ncclSuccess;
}
// 3. Less hops (but not at the price of going cross NICs)
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess;
// 3. Less hops
if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
// 4. Prefer graph with more XGMI connections
@@ -426,7 +465,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
graph->nChannels--;
return ncclSuccess;
}
graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank[0];
graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
int g = gpu - system->nodes[GPU].nodes;
if (step == backToNet) {
// first get back to NIC
@@ -467,6 +506,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
}
free(nets);
}
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
int next[NCCL_TOPO_MAX_NODES];
@@ -512,7 +553,6 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
struct ncclTopoNode* gpu;
if (graph->collNet && net->net.collSupport == 0) continue;
if (net->net.bw < bw) continue;
if (net->net.maxChannels == 0) continue;
graph->inter[graph->nChannels*2] = net->id;
graph->latencyInter = net->net.latency;
@@ -523,59 +563,63 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
system->nodes[NET].nodes[i].net.bw -= bw;
}
}
net->net.maxChannels--;
// First try to replay the last channel
if (graph->nChannels > 0) {
int g;
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0) {
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
struct ncclTopoLinkList* paths = net->paths[GPU];
int f = 0, f_gdr = 0;
// find the first GPU that is closest to NIC
for (int i = 0; i<system->nodes[GPU].count; i++) {
if (paths[i].count <= paths[f].count) {
// prefer GPU direct RDMA
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
f = i;
f_gdr = gdr;
// NVLS needs to balance on all NICs
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
} else {
if (graph->nChannels > 0) {
// Try to replay the last channel
int g;
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0) {
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
struct ncclTopoLinkList* paths = net->paths[GPU];
int f = 0, f_gdr = 0;
// find the first GPU that is closest to NIC
for (int i = 0; i<system->nodes[GPU].count; i++) {
if (paths[i].count <= paths[f].count) {
// prefer GPU direct RDMA
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
f = i;
f_gdr = gdr;
}
}
}
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
if (t == -1) *time = -1;
}
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? FORCED_ORDER_PCI : 0, &t, NET, n, f));
if (t == -1) *time = -1;
}
// Then try the most local GPUs
float maxBw = 0;
int minHops = 0xfffffff;
struct ncclTopoLinkList* paths = net->paths[GPU];
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].bw > maxBw) {
maxBw = paths[g].bw;
minHops = paths[g].count;
} else if (paths[g].bw == maxBw && paths[g].count < minHops) {
minHops = paths[g].count;
// Then try the most local GPUs
float maxBw = 0;
int minHops = 0xfffffff;
struct ncclTopoLinkList* paths = net->paths[GPU];
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].bw > maxBw) {
maxBw = paths[g].bw;
minHops = paths[g].count;
} else if (paths[g].bw == maxBw && paths[g].count < minHops) {
minHops = paths[g].count;
}
}
}
if (maxBw >= bw) {
// In the first loop, avoid using GPUs in both directions between channels (one channel
// sending from that GPU and one channel receiving to that GPU), since that usually leads
// to lower BW.
for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].bw == maxBw && paths[g].count == minHops) {
gpu = system->nodes[GPU].nodes+g;
int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
if (tryGpuBidir == gpuUsed) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
if (maxBw >= bw) {
// In the first loop, avoid using GPUs in both directions between channels (one channel
// sending from that GPU and one channel receiving to that GPU), since that usually leads
// to lower BW.
for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].bw == maxBw && paths[g].count == minHops) {
gpu = system->nodes[GPU].nodes+g;
int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
if (tryGpuBidir == gpuUsed) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
}
}
}
}
@@ -583,7 +627,6 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
net->net.maxChannels++;
for (int i=0; i<system->nodes[NET].count; i++) {
if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
(system->nodes[NET].nodes[i].net.port == net->net.port)) {
@@ -634,7 +677,10 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
// Intra-node only.
if (graph->nChannels == 0) {
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels));
return ncclSuccess;
} else if (graph->nChannels == 0) {
// Try PCI order first
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
} else {
@@ -683,7 +729,7 @@ ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, st
} else if (strcmp(sub->name, "gpu") == 0) {
int rank = -1;
for (int g=0; g<ngpus; g++) {
if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank[0];
if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
}
if (rank == -1) {
WARN("XML Import Channel : dev %d not found.", dev);
@@ -701,7 +747,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
int crossNic;
NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
if (graph->crossNic == 0 && crossNic == 1) return ncclSuccess;
if (ncclParamCrossNic() == 0 && crossNic == 1) return ncclSuccess;
graph->crossNic = crossNic;
NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
@@ -744,9 +790,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc
NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
int dev = -1;
for (int i=0; i<ngpus; i++) {
for ( int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
if (system->nodes[GPU].nodes[i].gpu.rank[j] == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
}
if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
}
if (dev == -1) {
WARN("XML Export Channel : rank %d not found.", intra[g]);
@@ -795,50 +839,39 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
float speedArrayIntra[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float speedArrayInter[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
#else
float speedArrayIntra[] = { 44.0, 30.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#endif
float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
#endif
// Rewrite graph->intra in place for the case where several ranks share one GPU.
// Every entry of the current intra array (nChannels rows of ngpus entries) is
// compared against the first rank (gpu.rank[0]) of each GPU node; on a match the
// entry is replaced by all nRanksPerGpu ranks recorded for that GPU. Entries that
// match no GPU node contribute nothing to the output.
static void ncclExpandMultiRank(ncclTopoSystem* system, struct ncclTopoGraph* graph)
{
  int ngpus = system->nodes[GPU].count;
  // Work from a snapshot, because graph->intra is overwritten while it is being expanded.
  int snapshot[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
  TRACE(NCCL_GRAPH, "TopoCompute: expanding intra array for multi-rank per GPU scenarios nChannels %d", graph->nChannels);
  memcpy(snapshot, graph->intra, ngpus*sizeof(int)*graph->nChannels);

  int out = 0;  // next write position in graph->intra
  for (int chan = 0; chan < graph->nChannels; chan++) {
    for (int slot = 0; slot < ngpus; slot++) {
      const int leadRank = snapshot[chan*ngpus+slot];
      for (int g = 0; g < ngpus; g++) {
        if (system->nodes[GPU].nodes[g].gpu.rank[0] != leadRank) continue;
        const int nRanks = system->nodes[GPU].nodes[g].gpu.nRanksPerGpu;
        for (int r = 0; r < nRanks; r++) {
          graph->intra[out++] = system->nodes[GPU].nodes[g].gpu.rank[r];
        }
      }
    }
  }
}
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
graph->crossNic = ncclParamCrossNic();
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
graph->nChannels = 0;
graph->sameChannels = 1;
graph->nIntraChannels = 0;
memset(graph->intraNets, 0, MAXCHANNELS*NCCL_TOPO_MAX_NODES*2*sizeof(int));
int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
graph->sameChannels = trySameChannels;
char* str = getenv("NCCL_GRAPH_FILE");
if (str) {
@@ -850,10 +883,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
free(xml);
if (graph->nChannels > 0) {
ncclExpandMultiRank(system, graph);
return ncclSuccess;
}
if (graph->nChannels > 0) return ncclSuccess;
}
str = getenv("NCCL_RINGS");
@@ -866,29 +896,17 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
} else if (!rcclParamModelMatchingDisable() && !graph->collNet) {
// try to match 8P6L
NCCLCHECK(parseChordalRing(system, graph));
if (graph->nChannels) {
ncclExpandMultiRank(system, graph);
return ncclSuccess;
}
if (graph->nChannels) return ncclSuccess;
// try to match Rome 4P2H
NCCLCHECK(parseRome4P2H(system, graph));
if (graph->nChannels) {
ncclExpandMultiRank(system, graph);
return ncclSuccess;
}
if (graph->nChannels) return ncclSuccess;
// try to match 1H16P
NCCLCHECK(parse1H16P(system, graph));
if (graph->nChannels) {
ncclExpandMultiRank(system, graph);
return ncclSuccess;
}
if (graph->nChannels) return ncclSuccess;
// try to match 4H4P
NCCLCHECK(parse4H4P(system, graph));
}
if (graph->nChannels) {
ncclExpandMultiRank(system, graph);
return ncclSuccess;
}
if (graph->nChannels) return ncclSuccess;
if ((graph->pattern == NCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) {
// limit single node max channels when searching ring graph on Rome
@@ -898,6 +916,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
// Force intra-node NVLS algorithm to pull evenly from all GPUs.
graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
}
struct ncclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
@@ -914,7 +940,10 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
}
int pass = 1;
int speedIndex = 0;
while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
float maxBw = system->maxBw;
float totalBw = system->totalBw;
if (ngpus == 1 || graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
@@ -948,12 +977,19 @@ search:
tmpGraph.sameChannels = 0;
goto search;
}
tmpGraph.sameChannels = 1;
tmpGraph.sameChannels = trySameChannels;
if (time != -1) globalTimeout += time;
else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
if (globalTimeout < 0 && graph->nChannels) goto done;
// Try a simpler tree
if (ccMin >= 90 && tmpGraph.pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
goto search;
}
tmpGraph.pattern = graph->pattern;
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
tmpGraph.typeIntra += 1;
@@ -974,20 +1010,13 @@ search:
}
tmpGraph.crossNic = 0;
// Try a simpler tree
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
goto search;
}
tmpGraph.pattern = graph->pattern;
// Decrease bw until we find a solution
if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
goto search;
}
speedIndex = 0;
while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
}
@@ -1016,24 +1045,26 @@ done:
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
}
if (graph->nChannels == 0 && graph->collNet == 0) {
if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank[0];
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
graph->bwIntra = graph->bwInter = 0.1;
graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
if (graph->bwIntra >= 25.0) {
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
}
ncclExpandMultiRank(system, graph);
if (graph->nChannels == 0) return ncclSuccess;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
if (graph->bwIntra < 25.0) return ncclSuccess;
if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
return ncclSuccess;
}
@@ -1085,23 +1116,40 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
#include "comm.h"
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head.
// Looks for the first channel whose leading intra entry is comm->rank and stores that
// channel's first inter entry in *dev. Returns ncclInternalError (after a WARN) when no
// channel starts with comm->rank.
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) {
  const int ranksPerChannel = comm->topo->nodes[GPU].count;

  // Advance to the first channel headed by our rank, or run off the end.
  int chan = 0;
  while (chan < graph->nChannels && graph->intra[chan*ranksPerChannel] != comm->rank) {
    chan++;
  }

  if (chan >= graph->nChannels) {
    WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
    return ncclInternalError;
  }

  *dev = graph->inter[chan*2];
  return ncclSuccess;
}
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
#include "comm.h"
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
if (graph) {
// Honor the net device in the graph
int channel = channelId%graph->nChannels;
int ngpus = comm->topo->nodes[GPU].count;
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
*dev = graph->inter[channel*2+index];
if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
*dev = graph->inter[channel*2+index];
} else {
NCCLCHECK(getNvlsNetDev(comm, graph, dev));
}
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
} else if (peerRank == -1) {
return ncclInternalError;
} else {
// Start with our local NIC and local Rank
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, dev));
*proxyRank = rank;
int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
@@ -1111,7 +1159,9 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
int cudaDev = comm->peerInfo[peerRank].cudaDev;
int localRank;
if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess;
int netDev = comm->peerInfo[localRank].netDev;
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
int n;
// Check that device exists on our node
if (ncclParamCrossNic() == 0) {
@@ -1131,20 +1181,17 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
}
} else if (pxnLevel == 2) {
// Check whether we can access it through our node-local GPU for that NIC.
for (int r=0; r<comm->localRanks; r++) {
int peerRank = comm->localRankToRank[r];
if (comm->peerInfo[peerRank].netDev == netDev) {
int g1, g2, n;
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2));
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
*proxyRank = peerRank;
*dev = netDev;
return ncclSuccess;
}
// Check which local GPU corresponds to that NIC and see if we can use PXN.
int n, g1, g2;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netDev, &g2));
if (g2 != -1) {
struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
*proxyRank = peerGpu->gpu.rank;
*dev = netDev;
return ncclSuccess;
}
}
}
+129 -58
View File
@@ -117,10 +117,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
n->links[0].remNode = n;
n->links[0].bw = LOC_BW;
n->gpu.dev = NCCL_TOPO_UNDEF;
for (int i=0; i<RCCL_TOPO_MAX_RANKS_PER_GPU; i++) {
n->gpu.rank[i] = NCCL_TOPO_UNDEF;
}
n->gpu.nRanksPerGpu = NCCL_TOPO_UNDEF;
n->gpu.rank = NCCL_TOPO_UNDEF;
n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
} else if (type == CPU) {
n->cpu.arch = NCCL_TOPO_UNDEF;
@@ -256,15 +253,7 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
if (node->type == GPU) {
sprintf(line+offset, "%s/%lX (%d", topoNodeTypeStr[node->type], node->id, node->gpu.rank[0]);
int nextOffset;
int nextRank = 1;
while ( nextRank < node->gpu.nRanksPerGpu ) {
nextOffset = strlen(line);
sprintf(line+nextOffset, "/%d", node->gpu.rank[nextRank++]);
}
nextOffset = strlen(line);
sprintf(line+nextOffset, ")");
sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
} else if (node->type == CPU) {
sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else if (node->type == PCI) {
@@ -384,17 +373,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
rcclHipDeviceArch_t arch;
NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));
memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t));
//NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
const char *rankStr;
NCCLCHECK(xmlGetAttrStr(xmlGpu, "rank", &rankStr));
char *tmpStr;
char *token = strtok_r ( (char *)rankStr, ",", &tmpStr);
gpu->gpu.nRanksPerGpu = 0;
while (token != NULL && gpu->gpu.nRanksPerGpu < RCCL_TOPO_MAX_RANKS_PER_GPU) {
gpu->gpu.rank[gpu->gpu.nRanksPerGpu++] = atoi(token);
token = strtok_r(NULL, ",", &tmpStr);
}
NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
// Do not go any further, nvlinks will be added in a second pass
@@ -406,7 +385,6 @@ struct kvDict kvDictPciGen[] = {
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
{ NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
const char* str;
@@ -716,8 +694,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
//NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlSetOrAppendAttrInt(node, "rank", r));
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
}
@@ -744,11 +721,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
}
}
if (netDevCount == 0) {
NCCLCHECK(ncclNetDevices(comm, &netDevCount));
NCCLCHECK(comm->ncclNet->devices(&netDevCount));
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, n, &props));
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -777,10 +754,8 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) {
int g;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
int minType = PATH_SYS;
static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
int minType = PATH_DIS;
float maxBw = 0;
int count = 0;
int* nets;
@@ -790,20 +765,115 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* i
if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
maxBw = path->bw;
minType = path->type;
if (type) *type = minType;
count = 0;
}
if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
}
if (count == 0) {
*id = -1;
free(nets);
*localNetMask = 0ULL;
for (int n=0; n<count; n++) {
if (nets[n] >= 64) return ncclInternalError;
*localNetMask |= 1ULL<<nets[n];
}
free(nets);
return ncclSuccess;
}
// Helper for ncclTopoGetLocalNet. Fills localNetMasks[0..ngpus) with the mask that
// getLocalNetMask reports for every GPU, then, for the GPU hosting `rank`, outputs:
//   *localNetMask  - that GPU's own mask,
//   *netLocalGpus  - how many GPUs have exactly the same mask,
//   *netLocalGpu   - the position of that GPU among them (in GPU index order).
// It never frees localNetMasks, so the caller can release it on success and on failure.
static ncclResult_t getLocalNetSharing(struct ncclTopoSystem* system, int rank, uint64_t* localNetMasks,
                                       uint64_t* localNetMask, int* netLocalGpus, int* netLocalGpu) {
  int ngpus = system->nodes[GPU].count;

  // Fill localNetMasks for all GPUs.
  for (int g=0; g<ngpus; g++) {
    NCCLCHECK(getLocalNetMask(system, g, localNetMasks+g, NULL));
  }

  // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
  int gpu;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
  *netLocalGpus = 0;
  *netLocalGpu = 0;
  for (int g=0; g<ngpus; g++) {
    if (localNetMasks[g] == localNetMasks[gpu]) {
      if (g == gpu) *netLocalGpu = *netLocalGpus;
      (*netLocalGpus)++;
    }
  }
  *localNetMask = localNetMasks[gpu];
  return ncclSuccess;
}

// Pick the NET id that `rank` should use for channel `channelId`.
// The set bits of the rank's local-net mask are handed out round robin: first across
// the GPUs sharing that mask, then across channels, wrapping around the 64 mask bits.
// On success *id receives the selected bit index (a NET id, since getLocalNetMask sets
// bit `id` for each local net).
// Returns ncclInternalError when channelId is negative or when the rank has no local net,
// and propagates any error from the helpers it calls.
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
  // The channel counter below only counts up from 0, so a negative channelId could
  // never be matched and the search loop would not terminate.
  if (channelId < 0) return ncclInternalError;

  uint64_t* localNetMasks;
  int ngpus = system->nodes[GPU].count;
  NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));

  uint64_t localNetMask = 0ULL;
  int netLocalGpus = 0, netLocalGpu = 0;
  ncclResult_t res = getLocalNetSharing(system, rank, localNetMasks, &localNetMask, &netLocalGpus, &netLocalGpu);
  // Release the scratch array on every path; only the values extracted above are needed.
  free(localNetMasks);
  if (res != ncclSuccess) return res;
  if (localNetMask == 0) return ncclInternalError;

  // Round robin on GPUs and channels
  int gIndex = 0, cId = 0, n = 0;
  while (1) {
    if (1ULL << n & localNetMask) {
      if (gIndex == netLocalGpu && cId == channelId) {
        *id = n;
        return ncclSuccess;
      }
      gIndex++;
      if (gIndex == netLocalGpus) {
        gIndex = 0;
        cId++;
      }
    }
    n = (n+1) % 64;
  }
}
// Helper for ncclTopoGetLocalGpu. Scans every GPU, keeps those whose mask from
// getLocalNetMask contains bit `net`, and among them retains only the GPUs with the
// smallest path type reported by getLocalNetMask. Their indices are written to
// gpus[0..*netLocalGpus) and their common mask to *localNetMask (0 when none qualifies).
// Returns ncclInternalError if two retained GPUs report different masks.
// It never frees gpus, so the caller can release it on success and on failure.
// `net` must already be validated to lie in [0,64).
static ncclResult_t collectNetLocalGpus(struct ncclTopoSystem* system, int net, int* gpus,
                                        uint64_t* localNetMask, int* netLocalGpus) {
  int ngpus = system->nodes[GPU].count;
  int minType = PATH_DIS;
  *localNetMask = 0ULL;
  *netLocalGpus = 0;
  for (int g=0; g<ngpus; g++) {
    int type = PATH_DIS;
    uint64_t mask;
    NCCLCHECK(getLocalNetMask(system, g, &mask, &type));
    if (((1ULL<<net) & mask) == 0) continue;
    if (type < minType) {
      // Strictly better path type: restart the candidate list from this GPU.
      *localNetMask = mask;
      *netLocalGpus = 0;
      minType = type;
    }
    if (type == minType) {
      if (*localNetMask && mask != *localNetMask) {
        WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[*netLocalGpus-1], minType, net, mask, *localNetMask);
        return ncclInternalError;
      }
      gpus[*netLocalGpus] = g;
      (*netLocalGpus)++;
    }
  }
  return ncclSuccess;
}

// Find the GPU index associated with NET id `net`.
// On success *gpuIndex is the chosen GPU index, or -1 when no GPU qualifies for that net.
// The choice is a round robin over the set bits of the shared net mask: the k-th set bit
// maps to the (k mod netLocalGpus)-th candidate GPU.
// Returns ncclInternalError when `net` is outside [0,64) or when candidate GPUs disagree
// on their masks, and propagates any error from getLocalNetMask.
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
  // Masks hold one bit per NET id in a uint64_t; shifting by an id outside [0,64) is undefined.
  if (net < 0 || net >= 64) return ncclInternalError;

  int ngpus = system->nodes[GPU].count;
  int* gpus;
  NCCLCHECK(ncclCalloc(&gpus, ngpus));

  uint64_t localNetMask = 0ULL;
  int netLocalGpus = 0;
  ncclResult_t res = collectNetLocalGpus(system, net, gpus, &localNetMask, &netLocalGpus);
  if (res != ncclSuccess) {
    free(gpus);
    return res;
  }
  if (localNetMask == 0ULL) {
    *gpuIndex = -1;
    free(gpus);
    return ncclSuccess;
  }

  // Round robin on GPUs: each set bit below `net` advances the GPU slot by one, wrapping
  // at netLocalGpus. Bit `net` itself is always set in localNetMask here, because the mask
  // is only ever taken from a GPU whose mask contains that bit.
  int gIndex = 0;
  for (int n=0; n<net; n++) {
    if ((1ULL << n) & localNetMask) {
      gIndex++;
      if (gIndex == netLocalGpus) gIndex = 0;
    }
  }
  *gpuIndex = gpus[gIndex];
  free(gpus);
  return ncclSuccess;
}
/****************************/
@@ -822,20 +892,18 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
struct ncclTopoNode* cpu = NULL, *gpu = NULL;
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++) {
if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
gpu = system->nodes[GPU].nodes+g;
// Find closer CPU
int cpuIndex = -1, minHops = 0;
for (int c=0; c<system->nodes[CPU].count; c++) {
int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
if (cpuIndex == -1 || nHops < minHops) {
cpuIndex = c;
minHops = nHops;
}
}
cpu = system->nodes[CPU].nodes+cpuIndex;
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
gpu = system->nodes[GPU].nodes+g;
// Find closer CPU
int cpuIndex = -1, minHops = 0;
for (int c=0; c<system->nodes[CPU].count; c++) {
int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
if (cpuIndex == -1 || nHops < minHops) {
cpuIndex = c;
minHops = nHops;
}
}
cpu = system->nodes[CPU].nodes+cpuIndex;
}
}
if (cpu == NULL) {
@@ -885,6 +953,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
return ncclSuccess;
}
// Store the number of GPU nodes recorded in the topology into *count.
// Always returns ncclSuccess.
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count) {
  const int gpuNodes = system->nodes[GPU].count;
  *count = gpuNodes;
  return ncclSuccess;
}
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
*count = system->nodes[NET].count;
return ncclSuccess;
@@ -910,11 +983,9 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
for (int g=0; g<system->nodes[GPU].count; g++) {
for ( int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++ ){
if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
*localRank = g;
return ncclSuccess;
}
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
*localRank = g;
return ncclSuccess;
}
}
WARN("Could not find local GPU with rank %d", rank);
+15 -13
View File
@@ -13,12 +13,13 @@
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 22.0
#define SM80_NVLINK_BW 22.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.0
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define SKL_QPI_BW 9.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
@@ -75,7 +76,12 @@ extern const char* topoLinkTypeStr[];
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
#define PATH_SYS 7
#define PATH_DIS 7
// Connection through the network
#define PATH_NET 8
// Disconnected
#define PATH_DIS 9
extern const char* topoPathTypeStr[];
struct ncclTopoNode;
@@ -106,7 +112,6 @@ struct ncclTopoLinkList {
#define RCCL_TOPO_FORCE_INTRA 16
#define RCCL_TOPO_XGMI_ALL 32
#define RCCL_TOPO_MAX_RANKS_PER_GPU 8
struct ncclTopoNode {
int type;
int64_t id;
@@ -114,8 +119,7 @@ struct ncclTopoNode {
union {
struct {
int dev; // NVML dev number
int rank[RCCL_TOPO_MAX_RANKS_PER_GPU];
int nRanksPerGpu;
int rank;
int cudaCompCap;
int gdrSupport;
int gcn;
@@ -198,11 +202,9 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i
static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) {
*index = -1;
for (int i=0; i<system->nodes[GPU].count; i++) {
for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
if (system->nodes[GPU].nodes[i].gpu.rank[j] == rank) {
*index = i;
return ncclSuccess;
}
if (system->nodes[GPU].nodes[i].gpu.rank == rank) {
*index = i;
return ncclSuccess;
}
}
return ncclInternalError;
@@ -212,7 +214,7 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
*rank = -1;
for (int i=0; i<system->nodes[GPU].count; i++) {
if (system->nodes[GPU].nodes[i].gpu.dev == dev) {
*rank = system->nodes[GPU].nodes[i].gpu.rank[0];
*rank = system->nodes[GPU].nodes[i].gpu.rank;
return ncclSuccess;
}
}
+121 -66
View File
@@ -54,7 +54,10 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 } };
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
{ 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Tree, Ring
{ 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Collnet Direct, Chain
{ 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
@@ -71,18 +74,18 @@ struct tuningModel {
static struct tuningModel tuning_model_0 {
.hwLat = {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -101,18 +104,18 @@ static struct tuningModel tuning_model_0 {
static struct tuningModel tuning_model_1 {
.hwLat =
{ /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.bwRatio =
{ /* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -131,18 +134,18 @@ static struct tuningModel tuning_model_1 {
static struct tuningModel tuning_model_2 {
.hwLat = {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -161,18 +164,18 @@ static struct tuningModel tuning_model_2 {
static struct tuningModel tuning_model_3 {
.hwLat = {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -191,18 +194,18 @@ static struct tuningModel tuning_model_3 {
static struct tuningModel tuning_model_4 {
.hwLat = {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 } },
{ /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -232,21 +235,42 @@ static struct tuningModel rcclTuningModel[] = {
#define HOPPER_COMPCAP_IDX 2
// LL128 max BW per channel
static const double ll128MaxBwPerCh[3] = { 20.0, 20.0, 36.7 };
// Upper bounds applied to the LL-protocol bus bandwidth (used as llMaxBw in
// ncclTopoTuneModel). The row is selected by index1: the compute-capability
// index on a single node, otherwise 1 for AMD CPUs and 0 for any other vendor.
// The column is selected by index2.
// NOTE(review): the row labels suggest index2 maps to 1 / 2 / 4+ nodes — confirm
// against the index2 computation, which is not visible here.
static const double llMaxBws[3][3] = {
/* Volta-N1/Intel-N2/Intel-N4 */ {39.0, 39.0, 20.4},
/* Ampere-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
/* Hopper-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}
};
static const double perChMaxRingLL128Bws[3][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
};
static const double perChMaxTreeLL128Bws[3][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
};
static const double perChMaxTreeBws[3][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0},
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
};
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
// Network post overhead in ns (1000 = 1 us)
NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
static float getNetOverhead(struct ncclComm* comm) {
if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001;
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
else return 1.0;
}
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
@@ -262,7 +286,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
@@ -281,11 +306,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
double llMaxBw = llMaxBws[index1][index2];
double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
// De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
//if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
@@ -299,10 +325,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if ((coll != ncclFuncAllReduce) && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_NVLS && p != NCCL_PROTO_SIMPLE) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
@@ -315,13 +344,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
else
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
#else
if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -331,12 +359,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
busBw /= factor;
}
#endif
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;
// Convert bus BW to algorithm BW
float ratio;
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS) ratio = .75;
else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0;
else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1));
else ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio;
@@ -344,16 +373,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
//if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
if (a == NCCL_ALGO_RING) {
float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
if (ringGraph->sameChannels) {
if (graphs[a]->sameChannels) {
comm->latencies[coll][a][p] += lat;
} else {
if (p == NCCL_PROTO_SIMPLE) lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
comm->latencies[coll][a][p] += nsteps*lat;
}
} else {
// Inter-node rings still have to launch nsteps * net overhead.
float netOverhead = 0.0;
if (nNodes > 1) {
netOverhead = getNetOverhead(comm);
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
}
intraLat = std::max(intraLat, netOverhead);
comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
}
} else if (a == NCCL_ALGO_TREE) {
@@ -363,7 +401,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] +=
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency
} else if (a == NCCL_ALGO_COLLNET_CHAIN) {
comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat;
comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
} else if (a == NCCL_ALGO_NVLS) {
if (nNodes > 1) comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
} else if (a == NCCL_ALGO_NVLS_TREE) {
comm->latencies[coll][a][p] += 2*(nNodes-1)*rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
}
}
}
@@ -372,7 +414,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO");
if (protoStr) {
@@ -385,15 +427,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
}
// Disable NVLink SHARP if not supported
if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0;
if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
// Disable CollNet if it is not supported
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
// If user has hard set NCCL_ALGO=COLLNET, ignore it
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
}
@@ -415,7 +458,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
pEnable = 1;
pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
pEnable &= (graphs[a]->typeIntra <= PATH_NVL);
pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
pEnable &= (minCompCap == maxCompCap);
switch (minCompCap) {
case 70: pEnable &= 1; break;
@@ -433,28 +476,38 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (comm->rank == 0) {
char line[1024];
sprintf(line, "Latency/AlgBw |");
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %7s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
for (int block=0; block<2; block++) {
sprintf(line, " Algorithm |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], "");
}
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Protocol |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
}
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
}
}
}
@@ -514,7 +567,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
}
#endif
// Tree pipelining saves latency in aggregation cases
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
+2 -2
View File
@@ -789,8 +789,8 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
}
NCCLCHECK(ncclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode));
#else
nvmlDevice_t nvmlDev = NULL;
if (ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
nvmlDevice_t nvmlDev;
NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
#endif
return ncclSuccess;
-19
View File
@@ -178,25 +178,6 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
return ncclSuccess;
}
/*
 * Set an integer attribute on an XML node, or append to it if it exists.
 *
 * If `attrName` is not present on `node`, a new attribute is added whose value
 * is the decimal text of `value`. If it is present, ",<value>" is appended to
 * the existing text, producing a comma-separated list.
 *
 * The text is bounded the same way as before (snprintf limit of MAX_STR_LEN,
 * terminator forced at index MAX_STR_LEN); anything that does not fit is
 * silently truncated. Returns ncclSuccess, or the error propagated from
 * xmlGetAttrIndex.
 */
static ncclResult_t xmlSetOrAppendAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
  int index;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
  if (index == -1) {
    index = node->nAttrs++;
    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
    node->attrs[index].key[MAX_STR_LEN] = '\0';
    snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
    node->attrs[index].value[MAX_STR_LEN] = '\0';
    return ncclSuccess;
  }
  // Append in place instead of going through a strdup'ed copy: this removes a
  // heap allocation whose failure (NULL) was never checked before being passed
  // to "%s", and it cannot leak.
  char* attrValue = node->attrs[index].value;
  attrValue[MAX_STR_LEN] = '\0';  // guarantee termination before measuring
  size_t used = strlen(attrValue);
  // Only write when there is room left under the MAX_STR_LEN snprintf limit.
  if (used < MAX_STR_LEN) snprintf(attrValue + used, MAX_STR_LEN - used, ",%d", value);
  return ncclSuccess;
}
static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
int index;
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+17 -34
View File
@@ -45,13 +45,14 @@ ncclResult_t ncclAsyncLaunch(
job->undo = undo;
job->destructor = destructor;
job->abortFlag = comm->abortFlag;
job->childAbortFlag = comm->childAbortFlag;
job->state = ncclGroupJobRunning;
job->comm = comm;
/* check if there are blocking and nonblocking comms at the same time in group. */
if (ncclGroupBlocking == -1) {
/* first met communicator */
ncclGroupBlocking = comm->blocking;
} else if (ncclGroupBlocking != comm->blocking) {
ncclGroupBlocking = comm->config.blocking;
} else if (ncclGroupBlocking != comm->config.blocking) {
WARN("Blocking and nonblocking communicators are not allowed in the same group.");
ret = ncclInvalidArgument;
}
@@ -87,23 +88,20 @@ ncclResult_t ncclGroupStart() {
ncclResult_t ret = ncclSuccess;
NVTX3_FUNC_RANGE_IN(nccl_domain);
/* if previous group launch does not complete, don't launch this one. */
if (ncclGroupJobMainPtr != NULL) {
if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
ret = ncclInvalidUsage;
goto exit;
} else {
NCCLCHECKGOTO(groupJobComplete(ncclGroupJobMainPtr), ret, exit);
}
}
NCCLCHECK(ncclGroupStartInternal());
TRACE_CALL("ncclGroupStart()");
exit:
return ret;
}
ncclResult_t ncclGroupStartInternal() {
inline ncclResult_t ncclGroupStartInternal() {
/* if previous group launch does not complete, don't launch this one. */
if (ncclGroupJobMainPtr != NULL) {
if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
return ncclInvalidUsage;
} else {
NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr));
}
}
ncclGroupDepth++;
if (mscclAvailable() && !mscclIsCaller()) {
NCCLCHECK(mscclGroupStart());
@@ -204,13 +202,6 @@ failure:
return result;
}
static inline void groupResetJobState() {
ncclGroupBlocking = -1;
ncclGroupJobMainPtr = NULL;
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
return;
}
static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) {
struct ncclComm* comm = *groupCommHeadPtr;
@@ -255,7 +246,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
}
if (!comm->blocking)
if (!comm->config.blocking)
(void) ncclCommSetAsyncError(comm, error);
comm = next;
}
@@ -264,7 +255,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
*job->abortFlag = 1;
if (job->comm && !job->comm->blocking)
if (job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, error);
if (job->undo) job->undo(job);
if (job->destructor) job->destructor((void*)job);
@@ -339,6 +330,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
if (*groupAbortFlag == true || errorJobAbortFlag == true) {
*job->abortFlag = 1;
if (job->childAbortFlag) *job->childAbortFlag = 1;
}
job = job->next;
@@ -359,7 +351,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (job->comm && !job->comm->blocking)
if (job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, ret);
if (job->destructor) job->destructor((void*)job);
}
@@ -368,7 +360,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
struct ncclComm* comm = groupCommHeadMain;
struct ncclComm* next = comm->groupNext;
(void) ncclGroupCommLeave(comm);
if (!comm->blocking) {
if (!comm->config.blocking) {
(void) ncclCommSetAsyncError(comm, ret);
}
groupCommHeadMain = next;
@@ -449,15 +441,6 @@ fail:
goto exit;
}
/*
 * Complete a group job and reset the group state.
 *
 * A NULL job is a no-op that reports success. Otherwise the result of
 * ncclAsyncJobComplete on the job's base is returned, and the group state is
 * reset regardless of that result.
 */
static ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
  if (job == NULL) return ncclSuccess;
  ncclResult_t status = ncclAsyncJobComplete(&job->base);
  groupResetJobState();
  return status;
}
void ncclGroupJobAbort() {
ncclGroupJobAbortFlag = true;
(void) groupJobComplete(ncclGroupJobMainPtr);
+3
View File
@@ -13,6 +13,9 @@
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
// ALIGN_POWER(x, y): when x > y, expands to x rounded up to a multiple of y
// (ROUNDUP); otherwise expands to (y)/((y)/(x)).
// Both arguments are evaluated more than once, so avoid side effects in them.
// NOTE(review): presumably meant for positive integer operands — with x == 0
// the second branch divides by zero; confirm callers never pass 0.
#define ALIGN_POWER(x, y) \
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
+78 -1
View File
@@ -12,6 +12,7 @@
#include "checks.h"
#include "align.h"
#include "utils.h"
#include "p2p.h"
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
@@ -87,6 +88,77 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
#define MAX_ALLOC_TRACK_NGPU 32
extern struct allocationTracker allocTracker[];
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
// Allocate device memory on the current CUDA device through the driver's
// virtual memory management calls (create / reserve / map / set access).
//
//   ptr     - out: receives the reserved virtual address the memory is mapped at.
//   handlep - out, optional: when non-NULL, receives the allocation handle.
//   size    - requested size; rounded up to the minimum allocation granularity
//             before use.
//
// Returns ncclSuccess when every step succeeds; failures are reported through
// the CUDACHECK / CUCHECK macros.
// NOTE(review): CUCHECK presumably returns early on failure; if so, a failure
// after cuMemCreate (or after cuMemAddressReserve / cuMemMap) leaves the
// earlier handle / address range unreleased — confirm and consider cleanup.
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
CUdevice currentDev;
CUmemAllocationProp prop = {};
CUmemAccessDesc accessDesc = {};
CUmemGenericAllocationHandle handle;
int cudaDev;
int flag = 0;
// Resolve the runtime's current device into a driver CUdevice.
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
prop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
// Round the local copy of size up to a multiple of the minimum granularity.
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &prop, 0));
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
/* Map the virtual address range to the physical allocation */
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
/* Now allow RW access to the newly mapped memory */
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = currentDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
// The handle is only handed back when the caller asked for it.
if (handlep) *handlep = handle;
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
return result;
}
// Tears down a cuMem* allocation given its mapped address: looks up the backing
// allocation handle, unmaps the range, releases the handle and frees the
// virtual address range.
//   ptr - mapped address to free; NULL is accepted and returns ncclSuccess.
// Returns `result`, which is still ncclSuccess when the end of the function is
// reached.
static inline ncclResult_t ncclCuMemFree(void *ptr) {
if (ptr == NULL) return ncclSuccess;
ncclResult_t result = ncclSuccess;
CUmemGenericAllocationHandle handle;
size_t size = 0;
// Recover the allocation handle from the mapped address. Per the CUDA driver
// API, this retain takes an extra reference, which the release right after it
// gives back; the handle value itself is still used further down.
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
CUCHECK(cuMemRelease(handle));
// Size of the range containing ptr; needed by the unmap and address-free calls.
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
// NOTE(review): presumably this second release drops the reference held since
// the allocation was created -- confirm against the allocation path.
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
return result;
}
#else
extern int ncclCuMemEnable();
// Fallback compiled when CUDART_VERSION < 11030: the cuMem path is unavailable,
// so this ignores its arguments, emits a warning and returns ncclInternalError.
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
// Fallback compiled when CUDART_VERSION < 11030: always warns and returns
// ncclInternalError. Unlike the CUDART_VERSION >= 11030 variant, a NULL ptr is
// not treated as success here.
static inline ncclResult_t ncclCuMemFree(void *ptr) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
#endif
template <typename T>
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
ncclResult_t result = ncclSuccess;
@@ -193,8 +265,13 @@ template <typename T>
ncclResult_t ncclCudaFree(T* ptr) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaFree(ptr), result, finish);
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
} else {
CUDACHECKGOTO(cudaFree(ptr), result, finish);
}
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
return result;
+1
View File
@@ -20,6 +20,7 @@ ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
+3 -1
View File
@@ -9,7 +9,9 @@
#include "comm.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
+29 -29
View File
@@ -18,11 +18,11 @@
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
#define CUDACHECKGOTO(cmd, RES, label) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
res = ncclUnhandledCudaError; \
RES = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
@@ -60,11 +60,11 @@
} \
} while(true)
#define SYSCHECKGOTO(statement, res, label) do { \
#define SYSCHECKGOTO(statement, RES, label) do { \
if ((statement) == -1) { \
/* Print the back trace*/ \
res = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
@@ -72,16 +72,16 @@
#define NEQCHECK(statement, value) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define NEQCHECKGOTO(statement, value, res, label) do { \
#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
res = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
@@ -89,57 +89,57 @@
#define EQCHECK(statement, value) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define EQCHECKGOTO(statement, value, res, label) do { \
#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
res = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return RES; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while (0);
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
ncclResult_t res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return ncclInternalError; \
} \
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
#define NCCLCHECKTHREAD(a, args) do { \
+5 -4
View File
@@ -63,11 +63,12 @@ struct ncclDevRedOpFull {
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type))
#define DECL3(func, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef)
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef) \
DECL4(func, NVLS_TREE, devredop, type, undef)
#if defined(RCCL_BFLOAT16)
#define DECL2(func, devredop, undefForFloat) \
+55 -20
View File
@@ -101,19 +101,51 @@ struct ncclCommCallback {
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};
// Reference-counted bundle of resources that a communicator points to through
// ncclComm::sharedRes. Presumably shared between a communicator and the
// communicators split from it -- TODO confirm against the split code path.
// Fields prefixed "tp" appear to describe the top parent communicator
// (cf. the tpRankToLocalRank comment below) -- TODO confirm.
struct ncclSharedResources {
// Number of users of this structure.
int refCount;
struct ncclComm* owner; /* comm which creates this shared res. */
// One entry per channel, MAXCHANNELS in total.
struct ncclChannelPeer* peers[MAXCHANNELS];
struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
/* P2P operation counter, one per channel */
uint64_t p2pOpCount[MAXCHANNELS];
/* Collective operation counter */
uint64_t collOpCount;
int tpNRanks;
int tpNLocalRanks;
int tpNChannels;
int tpP2pNChannels;
int tpP2pChunkSize;
uint64_t magic;
// top parent rank to localRank translation table
int* tpRankToLocalRank;
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
/* proxy related shared res */
struct ncclProxyState* proxyState;
};
struct ncclChannel {
struct ncclChannelPeer* peers;
struct ncclDevChannelPeer* devPeers;
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
uint64_t p2pOpCount;
/* comm split sharable resources */
struct ncclChannelPeer* collnetPeers;
struct ncclDevChannelPeer* collnetDevPeers;
struct ncclChannelPeer* nvlsPeers;
struct ncclDevChannelPeer* nvlsDevPeers;
};
struct ncclWorkList {
@@ -167,6 +199,10 @@ struct ncclComm {
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
struct ncclSharedResources* sharedRes;
/* map to top parent ranks. */
int* topParentRanks;
int* topParentLocalRanks;
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
@@ -180,15 +216,16 @@ struct ncclComm {
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
//int nvmlDev; // my nvml device index
int compCap; // compute capability of the GPU
int minCompCap; // min compute capability in the communicator
int minCompCap, maxCompCap; // min/max compute capability in the communicator
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int WarpSize;
int virtualId;
int cudaArch; // matches __CUDA_ARCH__ of device
int node;
@@ -207,12 +244,11 @@ struct ncclComm {
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
// Collective operation counter
uint64_t collOpCount;
// Channels for collectives
int nChannels;
int nvlsChannels;
int collNetChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
@@ -237,6 +273,8 @@ struct ncclComm {
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t *childAbortFlag;
uint32_t *abortFlagRefCount;
// Flags for enable P2P NET
uint32_t p2pNet;
@@ -268,21 +306,24 @@ struct ncclComm {
char intraPad2[64 - sizeof(uint64_t)];
uint64_t intraBarrierGate; // only used if this is intraComm0
struct ncclProxyState proxyState;
struct ncclProxyState* proxyState;
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
// Whether this communicator uses collNet
int collNetSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
int intraHighestTransportType;
int* collNetHeads;
int collNetHeadsNum;
/* sharable collNet proxy progress resource. */
struct ncclCollNetSharedRes* collNetSharedRes;
// NVLink SHARP (NVLS) support
int nvlsSupport;
void* nvlsResources;
/* sharable NVLS resource. */
struct ncclNvlsSharedRes* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
@@ -319,13 +360,7 @@ struct ncclComm {
volatile bool collTraceExit;
#endif
// communicator mode
int blocking;
// CGA cluster size
int cgaClusterSize;
int minCTAs, maxCTAs;
// network interface name
char *netName;
ncclConfig_t config;
// initState is to more conveniently reclaim resources when errors happen.
ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called
+4
View File
@@ -11,6 +11,9 @@
#include <cuda_runtime.h>
#include "checks.h"
// Is cuMem API usage enabled
extern int ncclCuMemEnable();
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
#else
@@ -85,6 +88,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
+15 -6
View File
@@ -21,12 +21,13 @@
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -112,10 +113,10 @@ struct ncclConnInfo {
};
struct ncclProxyConnector {
int rank;
int localRank;
int tpRank;
int tpLocalRank;
int sameProcess;
struct ncclProxyConnection* connection;
struct ncclComm* comm;
};
struct ncclConnector {
@@ -124,7 +125,6 @@ struct ncclConnector {
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
struct ncclComm *comm;
};
struct ncclRing {
@@ -141,6 +141,9 @@ struct ncclRing {
};
// The root of each tree only has one node down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
@@ -161,18 +164,24 @@ struct ncclDirect {
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_NVLS_ARITY 8
#define NCCL_MAX_NVLS_TREE_ARITY 3
// NVLS connectivity description; struct ncclChannel embeds one instance per
// channel. Field meanings not spelled out below are inferred from names only.
struct ncclNvls {
int out;
int nHeads; // Number of N<->1<->net operations we'll do in parallel; size of up/down
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
// Capacity is NCCL_MAX_NVLS_ARITY entries.
int up[NCCL_MAX_NVLS_ARITY];
int down;
// NOTE(review): treeUp/treeDown presumably hold the parent and children used by
// the NVLS tree algorithm (capacity NCCL_MAX_NVLS_TREE_ARITY) -- confirm.
int treeUp;
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
// NOTE(review): presumably this rank's node index and the total node count -- confirm.
int node;
int nNodes;
};
#define NCCL_MAX_CONNS 3
// Connection state towards one peer: NCCL_MAX_CONNS send connectors and
// NCCL_MAX_CONNS receive connectors, plus a reference count.
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
// NOTE(review): presumably counts how many channels/communicators share this
// peer entry -- confirm against the code that increments it.
int refCount;
};
struct ncclDevComm;
@@ -362,7 +371,7 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
#endif
struct alignas(16) ncclDevChannel {
struct ncclDevChannelPeer *peers;
struct ncclDevChannelPeer** peers;
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collnetChain;
+1 -1
View File
@@ -298,7 +298,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
CUDACHECK(cudaFree(md->gdrDevMem));
NCCLCHECK(ncclCudaFree(md->gdrDevMem));
free(md);
return ncclSuccess;
+9 -8
View File
@@ -59,9 +59,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
#define NCCL_TOPO_CPU_TYPE_ROME 4
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
#define NCCL_TOPO_MAX_NODES 256
@@ -72,6 +74,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
@@ -108,18 +111,16 @@ struct ncclTopoRanks {
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc);
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
+20 -1
View File
@@ -36,6 +36,7 @@ struct ncclAsyncJob {
void(*destructor)(void*);
ncclGroupJobState_t state;
volatile uint32_t *abortFlag; /* point to comm abortFlag */
volatile uint32_t *childAbortFlag; /* point to child abortFlag */
ncclComm_t comm;
};
@@ -67,6 +68,24 @@ extern __thread ncclResult_t ncclGroupError;
extern __thread struct ncclComm* ncclGroupCommHead;
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
extern __thread int ncclGroupBlocking;
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
extern __thread struct ncclGroupJob ncclGroupJobMain;
/* Puts this thread's group-job bookkeeping back to its idle values:
 * ncclGroupJobMain is zero-filled, ncclGroupJobMainPtr is cleared and
 * ncclGroupBlocking is set to -1. */
static inline void groupResetJobState() {
  memset(&ncclGroupJobMain, 0, sizeof(ncclGroupJobMain));
  ncclGroupJobMainPtr = NULL;
  ncclGroupBlocking = -1;
}
/* Finishes a group job: hands the job's base async job to
 * ncclAsyncJobComplete(), then resets this thread's group-job state through
 * groupResetJobState().
 * A NULL job is a no-op that reports ncclSuccess.
 * Returns whatever ncclAsyncJobComplete() returned. */
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
  if (!job) return ncclSuccess;

  ncclResult_t status = ncclAsyncJobComplete(&job->base);
  groupResetJobState();
  return status;
}
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
if (ncclGroupDepth > 0) {
@@ -91,7 +110,7 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
ncclMemoryStackPush(&comm->memScoped);
}
ncclGroupBlocking = comm->blocking;
ncclGroupBlocking = comm->config.blocking;
}
// Add comm to this thread's group needing preconnect
File diff suppressed because it is too large Load Diff
+44
View File
@@ -0,0 +1,44 @@
#ifndef NCCL_IBV_SYMBOLS_H_
#define NCCL_IBV_SYMBOLS_H_
#ifdef NCCL_BUILD_RDMA_CORE
#include <infiniband/verbs.h>
#else
#include "ibvcore.h"
#endif
#include "nccl.h"
/* IB Verbs function pointers.
 * One pointer per IB verbs entry point used by the library; the table is filled
 * in by buildIbvSymbols(). Each member is named ibv_internal_<name> and
 * presumably carries the signature of the libibverbs function ibv_<name> --
 * confirm against <infiniband/verbs.h> / ibvcore.h. */
struct ncclIbvSymbols {
int (*ibv_internal_fork_init)(void);
/* Device enumeration and open/close */
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
void (*ibv_internal_free_device_list)(struct ibv_device **list);
const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
int (*ibv_internal_close_device)(struct ibv_context *context);
/* Asynchronous events */
int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
/* Queries */
int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
/* Protection domains and memory registration */
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
/* DMA-BUF support */
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
/* Completion queues and queue pairs */
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
};
/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
#endif // NCCL_IBV_SYMBOLS_H_
File diff suppressed because it is too large Load Diff
+3 -2
View File
@@ -26,6 +26,7 @@ typedef enum : uint8_t {
ncclPatternCollnetChain,
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
@@ -94,7 +95,6 @@ struct ncclCudaStreamList {
struct ncclCudaStreamList *next;
cudaStream_t stream;
};
struct ncclTasks {
struct Peer {
bool sendSeen, recvSeen;
@@ -104,7 +104,8 @@ struct ncclTasks {
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
size_t collBytesTotal;
struct Peer* peers/*[nRanks]*/;
int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
int *p2pSendOrder, *p2pRecvOrder;
int p2pOrderSteps;
int nTasksColl, nTasksP2p;
// The list of user streams aggregated over all tasks present.
-19
View File
@@ -18,25 +18,6 @@ ncclResult_t ncclNetPluginInit();
ncclResult_t ncclNetInit(struct ncclComm* comm);
int ncclNetVersion(struct ncclComm* comm);
// Translation to external API
static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
// Test whether the current GPU support GPU Direct RDMA.
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
+21 -12
View File
@@ -126,7 +126,7 @@
* Systems:
*
* \image html
* https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
* https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png
*
* Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
* ranges to your code that automatically use the name of the enclosing function
@@ -561,18 +561,27 @@
/* Temporary helper #defines, removed with #undef at end of header */
#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
#if defined(_MSC_VER) && _MSC_VER < 1914
/* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser
* that does not work with domain::get's specialization for domain::global,
* and would require extra conditions to make SFINAE work for the overloaded
* get() functions. This macro disables use of overloaded get() in order to
* work with VS2015 and versions of VS2017 below 15.7, without penalizing
* users of newer compilers. Building with this flag set to 0 means errors
* when defining tag structs (see documentation for domain, named_category,
* and registered_string) will have more complex compiler error messages
* instead of the clear static_assert messages from the get() overloads.
/* Some compilers do not correctly support SFINAE, which is used in this API
* to detect common usage errors and provide clearer error messages (by using
* static_assert) than the compiler would produce otherwise. These compilers
* will generate errors while compiling this file such as:
*
* error: ‘name’ is not a member of ‘nvtx3::v1::domain::global’
*
* The following compiler versions are known to have this problem, and so are
* set by default to disable the SFINAE-based checks:
*
* - All MSVC versions prior to VS2017 Update 7 (15.7)
* - GCC 8.1-8.3 (the problem was fixed in GCC 8.4)
*
* If you find your compiler hits this problem, you can work around it by
* defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this
* header, or you can add a check for your compiler version to this #if.
* Also, please report the issue on the NVTX github page.
*/
#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
#if defined(_MSC_VER) && _MSC_VER < 1914 \
|| defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4
#define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0
#else
#define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1
@@ -1,30 +1,33 @@
/*
* Copyright 2021-2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
/*
* Helper array to get the alignment for each predefined C language type.
*/
typedef void* pointer_type;
#if __STDC_VERSION__ >= 201112L /* or CPP11 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
#include <uchar.h>
#include <stdalign.h>
#endif
/* `alignof` is available as of C11 or C++11 */
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
#define nvtx_alignof(type) alignof(type)
#define nvtx_alignof2(type,tname) alignof(type)
#else /* __STDC_VERSION__ >= 201112L */
#ifndef __cplusplus
#include <stddef.h>
#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
#define nvtx_alignof2(type,tname) nvtx_alignof(type)
#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */
#else /* __cplusplus */
#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
/* Create helper structs to determine type alignment. */
#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type
#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname
MKTYPEDEF(char);
MKTYPEDEF2(unsigned char, uchar);
@@ -54,22 +57,33 @@ MKTYPEDEF(size_t);
MKTYPEDEF(pointer_type);
MKTYPEDEF(wchar_t);
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
/* `char8_t` is available as of C++20 or C23 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
MKTYPEDEF(char8_t);
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
/* `char16_t` and `char32_t` are available as of C++11 or C11 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
MKTYPEDEF(char16_t);
MKTYPEDEF(char32_t);
#endif
/* C requires to include stddef.h to use `offsetof` */
#ifndef __cplusplus
#include <stddef.h>
#endif
#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d)
#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d)
#endif /* __STDC_VERSION__ >= 201112L */
#undef MKTYPEDEF
#undef MKTYPEDEF2
#endif /* __cplusplus */
#endif /* __STDC_VERSION__ >= 201112L */
/*
* Helper array to get the alignment for each predefined C/C++ language type.
* The order of entries must match the values in `enum nvtxPayloadSchemaEntryType`.
*/
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
@@ -109,13 +123,14 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
/*** Special character types ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)},
#else
{0, 0},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0},
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
#else
@@ -125,4 +140,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
};
#undef nvtx_alignof
#undef nvtx_alignof2
#undef nvtx_alignof2
+17
View File
@@ -9,4 +9,21 @@
#ifndef NCCL_P2P_H_
#define NCCL_P2P_H_
#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
// Descriptor for a cuMem-based shareable allocation.
// NCCL_P2P_HANDLE_TYPE (defined just above) selects the POSIX file-descriptor
// handle type, so `data` presumably carries that fd — confirm against the
// allocation/import code.
typedef struct {
int data; // Currently only support an fd based descriptor
} ncclCuDesc;
// IPC descriptor passed to the ncclP2p*ShareableBuffer functions declared below.
// It is a union: exactly one of the two representations is stored at a time,
// and the union itself does not record which one is active.
typedef union {
// Legacy CUDA IPC
cudaIpcMemHandle_t devIpc;
// cuMem API support
ncclCuDesc cuDesc;
} ncclIpcDesc;
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
#endif
+34 -21
View File
@@ -15,11 +15,13 @@
#include "ipcsocket.h"
#include <pthread.h>
#include "shm.h"
#include "p2p.h"
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
enum { proxyRecv=0, proxySend=1 };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
@@ -130,18 +132,11 @@ struct ncclProxySharedP2p {
int size;
char* cudaBuff;
char* hostBuff;
cudaIpcMemHandle_t ipc;
// CUDA IPC
ncclIpcDesc ipcDesc;
struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
};
struct ncclProxySharedCollNet {
int size;
char* cudaBuff;
char* hostBuff;
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
void* resources;
};
struct ncclProxyPeer {
struct ncclProxySharedP2p send;
struct ncclProxySharedP2p recv;
@@ -165,7 +160,6 @@ struct ncclProxyProgressState {
bool stop;
struct ncclProxyPeer** localPeers;
struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
struct ncclProxySharedCollNet collNet;
struct ncclProxyArgs* active;
struct ncclProxyArgs* pool;
struct ncclProxyPool* pools;
@@ -192,12 +186,27 @@ struct ncclProxyAsyncOp {
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
int tpRank;
int tpLocalRank;
ncclProxyAsyncOp* asyncOps;
int asyncOpCounter;
};
struct ncclProxyState {
int refCount;
int tpRank;
int tpnRanks;
int tpLocalnRanks;
int cudaDev;
int p2pnChannels;
int p2pChunkSize;
int nChannels;
int buffSizes[NCCL_NUM_PROTOCOLS];
bool allocP2pNetLLBuffers;
bool dmaBufSupport;
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
volatile uint32_t* abortFlag;
// Service thread
pthread_t thread;
struct ncclSocket* listenSock;
@@ -209,6 +218,7 @@ struct ncclProxyState {
struct ncclSocket* peerSocks;
struct ncclProxyOps* proxyOps;
void** sharedDevMems;
struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
// Progress thread
struct ncclProxyProgressState progressState;
@@ -228,13 +238,14 @@ enum proxyConnectState {
struct ncclProxyConnection {
int send, transport, shared;
int localRank;
int tpLocalRank, sameProcess;
struct ncclSocket* sock;
struct ncclTransportComm* tcomm;
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
void* transportResources;
proxyConnectState state;
struct ncclCollNetSharedRes* collNet;
};
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -250,7 +261,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* prox
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
enum ncclProxyMsgType {
ncclProxyMsgInit = 1,
ncclProxyMsgSharedInit = 2,
@@ -260,22 +271,24 @@ enum ncclProxyMsgType {
ncclProxyMsgClose = 6,
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8,
ncclProxyMsgConvertFd = 9 // cuMem API support
ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
};
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
enum { proxyRecv=0, proxySend=1 };
ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
#endif
+38 -10
View File
@@ -36,7 +36,6 @@ struct ncclComm;
struct ncclPeerInfo {
int rank;
int cudaDev;
int netDev;
int gdrSupport;
bool hasFineGrain;
uint64_t hostHash;
@@ -45,7 +44,6 @@ struct ncclPeerInfo {
int64_t busId;
struct ncclComm* comm;
int cudaCompCap;
int virtualId;
};
#define CONNECT_SIZE 128
@@ -53,15 +51,46 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
#if CUDART_VERSION >= 12010
#define NVLS_HANDLE_SIZE 64
// NVLS buffer resources: one multicast and one unicast allocation handle with
// their mapped addresses, plus the properties used to create/access them.
// NOTE(review): refCount suggests the structure is shared by several users
// (e.g. parent/child communicators) — confirm against ncclNvlsSetup/ncclNvlsFree.
struct ncclNvlsSharedRes {
int refCount;
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
// Raw handle bytes, NVLS_HANDLE_SIZE long; presumably the exported multicast
// handle exchanged between ranks — TODO confirm.
char shareableHandle[NVLS_HANDLE_SIZE];
int nChannels;
};
#endif /* CUDART_VERSION >= 12010 */
// CollNet buffers and proxy bookkeeping referenced from ncclProxyConnection
// (its `collNet` member points at this type).
// NOTE(review): refCount suggests the structure is shared by several users —
// confirm where it is incremented/decremented.
struct ncclCollNetSharedRes {
int refCount;
int size;
// Device-side and host-side buffer pointers (by name; allocation site not visible here).
char* cudaBuff;
char* hostBuff;
// Two proxy-append slots per network device (2*NCCL_MAX_NETDEVS entries);
// presumably one for send and one for recv — TODO confirm.
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
void* resources;
int nChannels;
size_t buffSize;
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(struct ncclConnector*);
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels);
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
};
struct ncclTransport {
@@ -74,10 +103,9 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
#if CUDART_VERSION >= 12010
ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
#endif
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
+620 -317
View File
File diff suppressed because it is too large Load Diff
+42 -3
View File
@@ -6,10 +6,46 @@
#include "nccl.h"
#include "debug.h"
#include "param.h"
#include "cudawrap.h"
#include <dlfcn.h>
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
static int ncclCuMemSupported = 0;
// Determine whether CUMEM & VMM RDMA is supported on this platform
// Probe whether the cuMem (VMM) allocation APIs, including GPUDirect RDMA on
// VMM memory, can be used with the current driver and the current CUDA device.
// Returns 1 only when every check passes; returns 0 when a capability is
// missing or when any CUDA runtime/driver query fails.
int ncclIsCuMemSupported() {
#if CUDART_VERSION < 11030
  return 0;
#else
  // All locals are declared before the first possible jump to `finish`.
  ncclResult_t status = ncclSuccess;  // handed to the *CHECKGOTO macros as their result variable
  int supported = 0;
  int driverVersion;
  int devIndex;
  int attrValue = 0;
  CUdevice cuDev;

  CUDACHECKGOTO(cudaDriverGetVersion(&driverVersion), status, finish);
  // Need CUDA_VISIBLE_DEVICES support
  if (driverVersion < 12000) goto finish;

  CUDACHECKGOTO(cudaGetDevice(&devIndex), status, finish);
  // The cuMemCreate driver entry point must have been resolved.
  if (CUPFN(cuMemCreate) == NULL) goto finish;
  CUCHECKGOTO(cuDeviceGet(&cuDev, devIndex), status, finish);

  // Query device to see if CUMEM VMM support is available
  CUCHECKGOTO(cuDeviceGetAttribute(&attrValue, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, cuDev), status, finish);
  if (!attrValue) goto finish;

  // Query device to see if CUMEM RDMA support is available
  CUCHECKGOTO(cuDeviceGetAttribute(&attrValue, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, cuDev), status, finish);
  if (!attrValue) goto finish;

  supported = 1;

finish:
  return (supported && status == ncclSuccess) ? 1 : 0;
#endif
}
// Report whether cuMem API usage is enabled (returns 0 or 1).
//
// NCCL_CUMEM_ENABLE semantics:
//   -2          : auto-detect, i.e. follow ncclCuMemSupported
//   0           : disabled
//   other value : enabled
//
// The previous expression OR-ed the raw parameter into the result, so -2
// (which is non-zero) always evaluated to "enabled" and the auto-detect
// branch could never turn the feature off.
int ncclCuMemEnable() {
  const auto param = ncclParamCuMemEnable();
  if (param == -2) return ncclCuMemSupported != 0;
  return param != 0;
}
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
#if CUDART_VERSION >= 11030
@@ -35,6 +71,7 @@ DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemMap, 10020);
DECLARE_CUDA_PFN(cuMemRelease, 10020);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
@@ -89,7 +126,6 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuCtxSetCurrent, 4000, 1);
LOAD_SYM(cuCtxGetDevice, 2000, 1);
/* cuMem API support */
#if CUDA_VERSION >= 11030
LOAD_SYM(cuMemAddressReserve, 10020, 1);
LOAD_SYM(cuMemAddressFree, 10020, 1);
LOAD_SYM(cuMemCreate, 10020, 1);
@@ -98,9 +134,9 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
LOAD_SYM(cuMemMap, 10020, 1);
LOAD_SYM(cuMemRelease, 10020, 1);
LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
LOAD_SYM(cuMemSetAccess, 10020, 1);
LOAD_SYM(cuMemUnmap, 10020, 1);
#endif
#if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
#endif
@@ -135,7 +171,7 @@ static void initOnceFunc() {
if (ncclCudaPath == NULL)
snprintf(path, 1024, "%s", "libcuda.so");
else
snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so");
(void) dlerror(); // Clear any previous errors
cudaLib = dlopen(path, RTLD_LAZY);
@@ -195,6 +231,9 @@ static void initOnceFunc() {
}
#endif
// Determine whether we support the cuMem APIs or not
ncclCuMemSupported = ncclIsCuMemSupported();
initResult = ncclSuccess;
return;
error:
+158
View File
@@ -0,0 +1,158 @@
#include <sys/types.h>
#include <unistd.h>
#include "ibvsymbols.h"
#ifdef NCCL_BUILD_RDMA_CORE
/* RDMA-core linking mode. Symbols are pointers to linked IB Verbs */
#define ASSIGN_SYM(container, symbol, name) container->name= &symbol;
// ibv_reg_mr is a macro in verbs.h, so its address cannot be taken directly;
// this real function forwards to it and can be stored in the symbol table.
struct ibv_mr* ibv_internal_reg_mr(struct ibv_pd* pd, void* addr, size_t length, int access) {
  struct ibv_mr* region = ibv_reg_mr(pd, addr, length, access);
  return region;
}
// Passthrough for the ibv_query_port macro in verbs.h: a real function that
// forwards to it so that a function pointer can be stored in the symbol table.
int ibv_internal_query_port(struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) {
  const int rc = ibv_query_port(context, port_num, port_attr);
  return rc;
}
// RDMA-core linked mode: fill the symbol table with the addresses of the
// directly linked IB Verbs functions. Always returns ncclSuccess.
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
  // Device enumeration / open / close
  ibvSymbols->ibv_internal_get_device_list = &ibv_get_device_list;
  ibvSymbols->ibv_internal_free_device_list = &ibv_free_device_list;
  ibvSymbols->ibv_internal_get_device_name = &ibv_get_device_name;
  ibvSymbols->ibv_internal_open_device = &ibv_open_device;
  ibvSymbols->ibv_internal_close_device = &ibv_close_device;

  // Asynchronous events
  ibvSymbols->ibv_internal_get_async_event = &ibv_get_async_event;
  ibvSymbols->ibv_internal_ack_async_event = &ibv_ack_async_event;

  // Queries
  ibvSymbols->ibv_internal_query_device = &ibv_query_device;
  ibvSymbols->ibv_internal_query_gid = &ibv_query_gid;
  ibvSymbols->ibv_internal_query_qp = &ibv_query_qp;

  // Protection domains and memory registration
  ibvSymbols->ibv_internal_alloc_pd = &ibv_alloc_pd;
  ibvSymbols->ibv_internal_dealloc_pd = &ibv_dealloc_pd;
  ibvSymbols->ibv_internal_reg_mr_iova2 = &ibv_reg_mr_iova2;
  ibvSymbols->ibv_internal_reg_dmabuf_mr = &ibv_reg_dmabuf_mr;
  ibvSymbols->ibv_internal_dereg_mr = &ibv_dereg_mr;

  // Completion queues and queue pairs
  ibvSymbols->ibv_internal_create_cq = &ibv_create_cq;
  ibvSymbols->ibv_internal_destroy_cq = &ibv_destroy_cq;
  ibvSymbols->ibv_internal_create_qp = &ibv_create_qp;
  ibvSymbols->ibv_internal_modify_qp = &ibv_modify_qp;
  ibvSymbols->ibv_internal_destroy_qp = &ibv_destroy_qp;

  // Miscellaneous
  ibvSymbols->ibv_internal_fork_init = &ibv_fork_init;
  ibvSymbols->ibv_internal_event_type_str = &ibv_event_type_str;

  // These two are macros in verbs.h, so the table points at the passthrough
  // functions defined in this file instead.
  ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr;
  ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port;

  return ncclSuccess;
}
#else
/* RDMA-core dynamic loading mode. Symbols are loaded from shared objects. */
#include <dlfcn.h>
#include "core.h"
// IBVERBS Library versioning
#define IBVERBS_VERSION "IBVERBS_1.1"
// Dynamic-loading mode: open libibverbs at run time and resolve every entry of
// the symbol table with dlvsym().
// Returns ncclSuccess when the library and all mandatory symbols were found.
// On any failure every table entry is reset to NULL, the library handle is
// closed (if it was opened) and ncclSystemError is returned.
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
  static void* libHandle = NULL;
  void* resolved;
  void** slot;

  // Try the unversioned library name first, then the ".1" soname.
  libHandle = dlopen("libibverbs.so", RTLD_NOW);
  if (libHandle == NULL) libHandle = dlopen("libibverbs.so.1", RTLD_NOW);
  if (libHandle == NULL) {
    INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]");
    goto teardown;
  }

// Resolve a mandatory symbol at IBVERBS_VERSION; a miss aborts the whole load.
#define LOAD_SYM(handle, symbol, funcptr) do {                                          \
    slot = (void**)&funcptr;                                                             \
    resolved = dlvsym(handle, symbol, IBVERBS_VERSION);                                  \
    if (resolved == NULL) {                                                              \
      WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION);  \
      goto teardown;                                                                     \
    }                                                                                    \
    *slot = resolved;                                                                    \
  } while (0)

// Attempt to load a specific symbol version - fail silently
// (the table entry is simply left NULL when the symbol is absent).
#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do {  \
    slot = (void**)&funcptr;                                      \
    *slot = dlvsym(handle, symbol, version);                      \
  } while (0)

  LOAD_SYM(libHandle, "ibv_get_device_list", ibvSymbols->ibv_internal_get_device_list);
  LOAD_SYM(libHandle, "ibv_free_device_list", ibvSymbols->ibv_internal_free_device_list);
  LOAD_SYM(libHandle, "ibv_get_device_name", ibvSymbols->ibv_internal_get_device_name);
  LOAD_SYM(libHandle, "ibv_open_device", ibvSymbols->ibv_internal_open_device);
  LOAD_SYM(libHandle, "ibv_close_device", ibvSymbols->ibv_internal_close_device);
  LOAD_SYM(libHandle, "ibv_get_async_event", ibvSymbols->ibv_internal_get_async_event);
  LOAD_SYM(libHandle, "ibv_ack_async_event", ibvSymbols->ibv_internal_ack_async_event);
  LOAD_SYM(libHandle, "ibv_query_device", ibvSymbols->ibv_internal_query_device);
  LOAD_SYM(libHandle, "ibv_query_port", ibvSymbols->ibv_internal_query_port);
  LOAD_SYM(libHandle, "ibv_query_gid", ibvSymbols->ibv_internal_query_gid);
  LOAD_SYM(libHandle, "ibv_query_qp", ibvSymbols->ibv_internal_query_qp);
  LOAD_SYM(libHandle, "ibv_alloc_pd", ibvSymbols->ibv_internal_alloc_pd);
  LOAD_SYM(libHandle, "ibv_dealloc_pd", ibvSymbols->ibv_internal_dealloc_pd);
  LOAD_SYM(libHandle, "ibv_reg_mr", ibvSymbols->ibv_internal_reg_mr);
  // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
  LOAD_SYM_VERSION(libHandle, "ibv_reg_mr_iova2", ibvSymbols->ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
  // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
  LOAD_SYM_VERSION(libHandle, "ibv_reg_dmabuf_mr", ibvSymbols->ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
  LOAD_SYM(libHandle, "ibv_dereg_mr", ibvSymbols->ibv_internal_dereg_mr);
  LOAD_SYM(libHandle, "ibv_create_cq", ibvSymbols->ibv_internal_create_cq);
  LOAD_SYM(libHandle, "ibv_destroy_cq", ibvSymbols->ibv_internal_destroy_cq);
  LOAD_SYM(libHandle, "ibv_create_qp", ibvSymbols->ibv_internal_create_qp);
  LOAD_SYM(libHandle, "ibv_modify_qp", ibvSymbols->ibv_internal_modify_qp);
  LOAD_SYM(libHandle, "ibv_destroy_qp", ibvSymbols->ibv_internal_destroy_qp);
  LOAD_SYM(libHandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init);
  LOAD_SYM(libHandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str);

  return ncclSuccess;

teardown:
  // Leave no partially populated table behind: clear every entry.
  ibvSymbols->ibv_internal_get_device_list = NULL;
  ibvSymbols->ibv_internal_free_device_list = NULL;
  ibvSymbols->ibv_internal_get_device_name = NULL;
  ibvSymbols->ibv_internal_open_device = NULL;
  ibvSymbols->ibv_internal_close_device = NULL;
  ibvSymbols->ibv_internal_get_async_event = NULL;
  ibvSymbols->ibv_internal_ack_async_event = NULL;
  ibvSymbols->ibv_internal_query_device = NULL;
  ibvSymbols->ibv_internal_query_port = NULL;
  ibvSymbols->ibv_internal_query_gid = NULL;
  ibvSymbols->ibv_internal_query_qp = NULL;
  ibvSymbols->ibv_internal_alloc_pd = NULL;
  ibvSymbols->ibv_internal_dealloc_pd = NULL;
  ibvSymbols->ibv_internal_reg_mr = NULL;
  ibvSymbols->ibv_internal_reg_mr_iova2 = NULL;
  ibvSymbols->ibv_internal_reg_dmabuf_mr = NULL;
  ibvSymbols->ibv_internal_dereg_mr = NULL;
  ibvSymbols->ibv_internal_create_cq = NULL;
  ibvSymbols->ibv_internal_destroy_cq = NULL;
  ibvSymbols->ibv_internal_create_qp = NULL;
  ibvSymbols->ibv_internal_modify_qp = NULL;
  ibvSymbols->ibv_internal_destroy_qp = NULL;
  ibvSymbols->ibv_internal_fork_init = NULL;
  ibvSymbols->ibv_internal_event_type_str = NULL;

  if (libHandle != NULL) dlclose(libHandle);
  return ncclSystemError;
}
#endif
+54 -182
View File
@@ -8,314 +8,186 @@
#include <sys/types.h>
#include <unistd.h>
#include <dlfcn.h>
#include "core.h"
/*Function Pointers*/
int (*ibv_internal_fork_init)(void);
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
void (*ibv_internal_free_device_list)(struct ibv_device **list);
const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
int (*ibv_internal_close_device)(struct ibv_context *context);
int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
/* DMA-BUF support */
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
// IBVERBS Library versioning
#define IBVERBS_VERSION "IBVERBS_1.1"
#include "ibvsymbols.h"
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static ncclResult_t initResult;
static void initOnceFunc(void) {
static void* ibvhandle = NULL;
void* tmp;
void** cast;
ibvhandle=dlopen("libibverbs.so", RTLD_NOW);
if (!ibvhandle) {
ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW);
if (!ibvhandle) {
INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]");
goto teardown;
}
}
#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \
if (tmp == NULL) { \
WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \
goto teardown; \
} \
*cast = tmp; \
} while (0)
// Attempt to load a specific symbol version - fail silently
#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
cast = (void**)&funcptr; \
*cast = dlvsym(handle, symbol, version); \
} while (0)
LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
LOAD_SYM(ibvhandle, "ibv_open_device", ibv_internal_open_device);
LOAD_SYM(ibvhandle, "ibv_close_device", ibv_internal_close_device);
LOAD_SYM(ibvhandle, "ibv_get_async_event", ibv_internal_get_async_event);
LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibv_internal_ack_async_event);
LOAD_SYM(ibvhandle, "ibv_query_device", ibv_internal_query_device);
LOAD_SYM(ibvhandle, "ibv_query_port", ibv_internal_query_port);
LOAD_SYM(ibvhandle, "ibv_query_gid", ibv_internal_query_gid);
LOAD_SYM(ibvhandle, "ibv_query_qp", ibv_internal_query_qp);
LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
// Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
// Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
LOAD_SYM(ibvhandle, "ibv_create_qp", ibv_internal_create_qp);
LOAD_SYM(ibvhandle, "ibv_modify_qp", ibv_internal_modify_qp);
LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibv_internal_destroy_qp);
LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init);
LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str);
initResult = ncclSuccess;
return;
teardown:
ibv_internal_get_device_list = NULL;
ibv_internal_free_device_list = NULL;
ibv_internal_get_device_name = NULL;
ibv_internal_open_device = NULL;
ibv_internal_close_device = NULL;
ibv_internal_get_async_event = NULL;
ibv_internal_ack_async_event = NULL;
ibv_internal_query_device = NULL;
ibv_internal_query_port = NULL;
ibv_internal_query_gid = NULL;
ibv_internal_query_qp = NULL;
ibv_internal_alloc_pd = NULL;
ibv_internal_dealloc_pd = NULL;
ibv_internal_reg_mr = NULL;
ibv_internal_reg_mr_iova2 = NULL;
ibv_internal_reg_dmabuf_mr = NULL;
ibv_internal_dereg_mr = NULL;
ibv_internal_create_cq = NULL;
ibv_internal_destroy_cq = NULL;
ibv_internal_create_qp = NULL;
ibv_internal_modify_qp = NULL;
ibv_internal_destroy_qp = NULL;
ibv_internal_fork_init = NULL;
ibv_internal_event_type_str = NULL;
if (ibvhandle != NULL) dlclose(ibvhandle);
initResult = ncclSystemError;
return;
}
struct ncclIbvSymbols ibvSymbols;
ncclResult_t wrap_ibv_symbols(void) {
pthread_once(&initOnceControl, initOnceFunc);
pthread_once(&initOnceControl,
[](){ initResult = buildIbvSymbols(&ibvSymbols); });
return initResult;
}
#define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \
if (name_internal == NULL) { \
/* CHECK_NOT_NULL: helper macro to check for NULL symbol */
#define CHECK_NOT_NULL(container, internal_name) \
if (container.internal_name == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
retval = call; \
}
#define IBV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
retval = container.call; \
if (retval == error_retval) { \
WARN("Call to " name " failed with error %s", strerror(errno)); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_PTR_CHECK(name_internal, call, retval, error_retval, name) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
retval = call; \
#define IBV_PTR_CHECK(container, internal_name, call, retval, error_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
retval = container.call; \
if (retval == error_retval) { \
WARN("Call to " name " failed"); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_INT_CHECK_RET_ERRNO(name_internal, call, success_retval, name) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
int ret = call; \
#define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
int ret = container.call; \
if (ret != success_retval) { \
WARN("Call to " name " failed with error %s", strerror(ret)); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_INT_CHECK(name_internal, call, error_retval, name) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
int ret = call; \
#define IBV_INT_CHECK(container, internal_name, call, error_retval, name) \
CHECK_NOT_NULL(container, internal_name); \
int ret = container.call; \
if (ret == error_retval) { \
WARN("Call to " name " failed"); \
return ncclSystemError; \
} \
return ncclSuccess;
#define IBV_PASSTHRU(name_internal, call) \
if (name_internal == NULL) { \
WARN("lib wrapper not initialized."); \
return ncclInternalError; \
} \
call; \
#define IBV_PASSTHRU(container, internal_name, call) \
CHECK_NOT_NULL(container, internal_name); \
container.call; \
return ncclSuccess;
ncclResult_t wrap_ibv_fork_init() {
IBV_INT_CHECK(ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
}
ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {
*ret = ibv_internal_get_device_list(num_devices);
*ret = ibvSymbols.ibv_internal_get_device_list(num_devices);
if (*ret == NULL) *num_devices = 0;
return ncclSuccess;
}
ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {
IBV_PASSTHRU(ibv_internal_free_device_list, ibv_internal_free_device_list(list));
IBV_PASSTHRU(ibvSymbols, ibv_internal_free_device_list, ibv_internal_free_device_list(list));
}
/* Returns the name string the verbs library reports for `device`.
 * This wrapper has no ncclResult_t channel: if the function pointer is unset
 * it logs a warning and terminates the process with exit(-1).
 * NOTE(review): each `if` / `return` appears twice -- the pre-merge bare
 * pointer form followed by the ibvSymbols form; only one pair belongs in a
 * compiled file (as listed, the braces do not balance). */
const char *wrap_ibv_get_device_name(struct ibv_device *device) {
if (ibv_internal_get_device_name == NULL) {
if (ibvSymbols.ibv_internal_get_device_name == NULL) {
WARN("lib wrapper not initialized.");
exit(-1);
}
return ibv_internal_get_device_name(device);
return ibvSymbols.ibv_internal_get_device_name(device);
}
/* Opens `device` and hands the resulting context back through *ret, using
 * IBV_PTR_CHECK with NULL as the failure value (macro defined outside this
 * view; presumably it stores the result in *ret before comparing -- confirm).
 * NOTE(review): the inline "0 / -1" comment does not match the call, which
 * tests a returned pointer against NULL. The two macro lines are the
 * pre-merge and post-merge (ibvSymbols) forms. */
ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/
IBV_PTR_CHECK(ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
IBV_PTR_CHECK(ibvSymbols, ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
}
/* Closes a device context. IBV_INT_CHECK treats a return of -1 as failure
 * (ncclSystemError); anything else is ncclSuccess.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/
IBV_INT_CHECK(ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
IBV_INT_CHECK(ibvSymbols, ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
}
/* Retrieves the next asynchronous event for `context` into *event.
 * A return of -1 from the library becomes ncclSystemError via IBV_INT_CHECK.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/
IBV_INT_CHECK(ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
IBV_INT_CHECK(ibvSymbols, ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
}
/* Acknowledges an asynchronous event. Uses IBV_PASSTHRU, so nothing the call
 * returns is checked and the wrapper reports ncclSuccess once the symbol is
 * confirmed to be set.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {
IBV_PASSTHRU(ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
IBV_PASSTHRU(ibvSymbols, ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
}
/* Fills *device_attr with the attributes of `context`'s device. Any non-zero
 * return is logged with strerror() and mapped to ncclSystemError by
 * IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
}
/* Queries port `port_num` of `context` into *port_attr. Any non-zero return
 * is logged with strerror() and mapped to ncclSystemError by
 * IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
}
/* Reads GID table entry `index` of port `port_num` into *gid. Any non-zero
 * return is logged with strerror() and mapped to ncclSystemError by
 * IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
}
/* Queries queue pair `qp`, forwarding attr, attr_mask and init_attr to the
 * library unchanged. Any non-zero return is logged with strerror() and mapped
 * to ncclSystemError by IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
}
/* Allocates a protection domain on `context` and returns it through *ret,
 * with NULL passed as the failure value to IBV_PTR_CHECK_ERRNO (macro defined
 * outside this view; presumably it reports errno on failure -- confirm).
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
IBV_PTR_CHECK_ERRNO(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
}
/* Releases protection domain `pd`. Any non-zero return is logged with
 * strerror() and mapped to ncclSystemError by IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
}
/* Registers [addr, addr+length) with `pd` using the given access flags and
 * returns the memory region through *ret; NULL is the failure value handed to
 * IBV_PTR_CHECK_ERRNO (macro defined outside this view).
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
}
/* "Direct" variant of wrap_ibv_reg_mr: returns the library's ibv_mr pointer
 * as-is instead of an ncclResult_t. If the function pointer is unset it logs
 * a warning and returns NULL, so callers cannot tell that case apart from a
 * registration failure by the return value alone.
 * NOTE(review): each `if` / `return` is listed twice -- pre-merge bare
 * pointer form, then the ibvSymbols form; only one pair belongs in a compiled
 * file (as listed, the braces do not balance). */
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
if (ibv_internal_reg_mr == NULL) {
if (ibvSymbols.ibv_internal_reg_mr == NULL) {
WARN("lib wrapper not initialized.");
return NULL;
}
return ibv_internal_reg_mr(pd, addr, length, access);
return ibvSymbols.ibv_internal_reg_mr(pd, addr, length, access);
}
/* Registers memory with an explicit I/O virtual address via ibv_reg_mr_iova2.
 * Two early exits precede the real call:
 *   - symbol unset -> ncclInternalError, returned silently (no WARN here,
 *     unlike the other wrappers);
 *   - ret == NULL  -> ncclSuccess without registering anything, so a caller
 *     can use a NULL `ret` purely to test whether the symbol is available.
 * NOTE(review): the `if` header and the macro line each appear twice --
 * pre-merge bare pointer form, then the ibvSymbols form. */
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
if (ibv_internal_reg_mr_iova2 == NULL) {
if (ibvSymbols.ibv_internal_reg_mr_iova2 == NULL) {
return ncclInternalError;
}
if (ret == NULL) { return ncclSuccess; } // Assume dummy call
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
}
/* DMA-BUF support */
/* Registers a DMA-BUF backed region (identified by `fd`, `offset`, `length`,
 * `iova`) with `pd` and returns it through *ret; NULL is the failure value
 * handed to IBV_PTR_CHECK_ERRNO (macro defined outside this view).
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
}
/* "Direct" DMA-BUF registration: returns the library's ibv_mr pointer as-is.
 * When the symbol is unset it returns NULL with errno set to EOPNOTSUPP and
 * logs nothing; per the inline comment, ncclIbDmaBufSupport() depends on that
 * errno value.
 * NOTE(review): each `if` / `return` is listed twice -- pre-merge bare
 * pointer form, then the ibvSymbols form; only one pair belongs in a compiled
 * file (as listed, the braces do not balance). */
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
if (ibv_internal_reg_dmabuf_mr == NULL) {
if (ibvSymbols.ibv_internal_reg_dmabuf_mr == NULL) {
errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
return NULL;
}
return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
return ibvSymbols.ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
}
/* Deregisters memory region `mr`. Any non-zero return is logged with
 * strerror() and mapped to ncclSystemError by IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
}
/* Creates a completion queue on `context`, forwarding cqe, cq_context,
 * channel and comp_vector unchanged, and returns it through *ret; NULL is the
 * failure value handed to IBV_PTR_CHECK_ERRNO (macro defined outside this
 * view).
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
IBV_PTR_CHECK_ERRNO(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
}
/* Destroys completion queue `cq`. Any non-zero return is logged with
 * strerror() and mapped to ncclSystemError by IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
}
/* Destroys queue pair `qp`. Any non-zero return is logged with strerror()
 * and mapped to ncclSystemError by IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
}
/* Creates a queue pair on `pd` from *qp_init_attr and returns it through
 * *ret; NULL is the failure value handed to IBV_PTR_CHECK_ERRNO (macro
 * defined outside this view).
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
IBV_PTR_CHECK_ERRNO(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
}
/* Applies the attributes selected by attr_mask from *attr to queue pair `qp`.
 * Any non-zero return is logged with strerror() and mapped to ncclSystemError
 * by IBV_INT_CHECK_RET_ERRNO.
 * NOTE(review): pre-merge bare-pointer call listed first, ibvSymbols form
 * second. */
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
}
/* Stores the library's textual name for `event` in *ret, cast to a plain
 * char*. Always returns ncclSuccess.
 * NOTE(review): the function pointer is not checked for NULL before the
 * call, unlike the macro-based wrappers. The two assignments are the
 * pre-merge (bare pointer) and post-merge (ibvSymbols member) forms of one
 * statement. */
ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
*ret = (char *) ibv_internal_event_type_str(event);
*ret = (char *) ibvSymbols.ibv_internal_event_type_str(event);
return ncclSuccess;
}
+4 -4
View File
@@ -106,7 +106,7 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
proxyOp.pattern = 0;
proxyOp.root = 0;
proxyOp.nbytes = status.stepSize*proxyOp.sliceSteps;
proxyOp.opCount = comm->collOpCount;
proxyOp.opCount = comm->sharedRes->collOpCount;
int nLoops = (int)(DIVUP(status.nBytes, (size_t)((size_t)hostAlgo->nChunksPerLoop*(size_t)status.chunkEffectiveSize)));
int nLoopsChunkSteps = nLoops * status.chunkSteps;
for (int ch = 0; ch < hostAlgo->nChannels; ch++) {
@@ -123,7 +123,7 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
}
proxyOp.nsteps = nLoopsChunkSteps * nRecvs;
if (proxyOp.nsteps > 0) {
NCCLCHECK(mscclSaveProxy(ncclChannel, proxyRecv, recvPeer->peer, &proxyOp, 0));
NCCLCHECK(mscclSaveProxy(comm, ncclChannel, proxyRecv, recvPeer->peer, &proxyOp, 0));
}
}
for (int i=0; i<mscclChannel->nSendPeers; i++){
@@ -136,12 +136,12 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
}
proxyOp.nsteps = nLoopsChunkSteps * nSends;
if (proxyOp.nsteps > 0) {
NCCLCHECK(mscclSaveProxy(ncclChannel, proxySend, sendPeer->peer, &proxyOp, 0));
NCCLCHECK(mscclSaveProxy(comm, ncclChannel, proxySend, sendPeer->peer, &proxyOp, 0));
}
}
}
NCCLCHECK(ncclProxyStart(comm));
comm->collOpCount++;
comm->sharedRes->collOpCount++;
return ncclSuccess;
}
+3 -1
View File
@@ -170,4 +170,6 @@ error:
return ncclSystemError;
}
/* Query hook for cuMem support. This implementation performs no detection
 * and unconditionally reports 0 (presumably "disabled" -- confirm against
 * the callers' interpretation of the value). */
int ncclCuMemEnable() {
  enum { kCuMemEnableResult = 0 };
  return kCuMemEnableResult;
}
+6 -8
View File
@@ -14,6 +14,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <utils.h>
struct shmHandleInternal {
int fd;
@@ -31,7 +32,7 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS
handle->devShmPtr = dptr;
handle->shmSize = shmSize;
handle->realShmSize = realShmSize;
handle->refcount = (int*)(hptr + shmSize);
handle->refcount = (hptr != NULL) ? (int*)(hptr + shmSize) : NULL;
if (create) {
int slen = strlen(shmPath);
handle->shmPath = (char*)malloc(slen + 1);
@@ -80,23 +81,20 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
if (hptr == MAP_FAILED) {
WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
ret = ncclSystemError;
hptr = NULL;
goto fail;
}
if (create) {
*(int*)(hptr + shmSize) = refcount;
} else {
int remref = __atomic_sub_fetch((int*)(hptr + shmSize), 1, __ATOMIC_RELAXED);
int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize));
if (remref == 0) {
/* the last peer has completed attachment, it should unlink the shm mem file. */
if (unlink(shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
}
}
if (refcount != -1) {
WARN("attaching memory should only reduce refcount by 1 but %d is passed", refcount);
}
}
if (devShmPtr) {
@@ -128,13 +126,13 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
if (tmphandle) {
if (tmphandle->fd >= 0) {
close(tmphandle->fd);
if (tmphandle->shmPath != NULL && *tmphandle->refcount > 0) {
if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
if (unlink(tmphandle->shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
ret = ncclSystemError;
}
free(tmphandle->shmPath);
}
free(tmphandle->shmPath);
}
if (tmphandle->shmPtr) {
+4 -1
View File
@@ -419,7 +419,7 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
if (sock->fd != -1) {
sock->state = ncclSocketStateAccepted;
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno);
WARN("socketTryAccept: Accept failed: %s", strerror(errno));
return ncclSystemError;
}
return ncclSuccess;
@@ -429,6 +429,9 @@ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
uint64_t magic;
enum ncclSocketType type;
int received = 0;
const int one = 1;
SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
if (received == 0) return ncclSuccess;
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
+17 -24
View File
@@ -23,7 +23,6 @@
#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1
#define RCCL_MULTIRANKPERGPU 1
#ifdef __cplusplus
extern "C" {
@@ -50,6 +49,7 @@ typedef enum { ncclSuccess = 0,
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1
/* Communicator configuration. Users can assign value to attributes to specify the
* behavior of a communicator. */
@@ -64,6 +64,7 @@ typedef struct ncclConfig_v21700 {
int minCTAs;
int maxCTAs;
const char *netName;
int splitShare;
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
@@ -76,7 +77,8 @@ typedef struct ncclConfig_v21700 {
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR /* netName */ \
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
}
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
@@ -131,28 +133,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/// @endcond
/*! @brief Creates a new communicator (multi thread/process version) allowing multiple ranks per device.
@details
rank must be between 0 and nranks-1 and unique within a communicator clique.
Each rank is associated to a HIP device, which has to be set before calling
ncclCommInitRankMulti.
Since this version of the function allows multiple ranks to utilize the same
HIP device, a unique virtualId per device has to be provided by each calling
rank.
ncclCommInitRankMulti implicitly synchronizes with other ranks, so it must be
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
@param[in]
comm ncclComm_t*
communicator struct pointer
*/
ncclResult_t ncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
/// @cond include_hidden
ncclResult_t pncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
/// @endcond
/*! @brief Creates a clique of communicators (single process version).
*
* @details This is a convenience function to create a single-process communicator clique.
@@ -191,6 +171,19 @@ ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
/// @endcond
/*! @brief Creates one or more communicators from an existing one.
* Ranks with the same color will end up in the same communicator.
* Within the new communicator, key will be used to order ranks.
* NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
* and will therefore return a NULL communicator.
* If config is NULL, the new communicator will inherit the original communicator's
* configuration*/
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/// @cond include_hidden
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/// @endcond
/* Returns a string for each error code. */
/*! @brief Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
/// @cond include_hidden
+33 -27
View File
@@ -265,10 +265,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
ncclResult_t ncclNetInit(struct ncclComm* comm) {
// Initialize main communication network
char* netName;
const char* netName;
bool ok = false;
netName = comm->netName;
netName = comm->config.netName;
for (int i=0; i<3; i++) {
if (ncclNets[i] == nullptr) continue;
enum ncclNetState state;
@@ -309,27 +309,31 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
return ncclSuccess;
}
#endif
int netDevs;
NCCLCHECK(ncclNetDevices(comm, &netDevs));
*gdrSupport = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, dev, &props));
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
static int gdrSupportMatrix[32] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
if (gdrSupportMatrix[comm->cudaDev] == -1) {
int netDevs;
NCCLCHECK(comm->ncclNet->devices(&netDevs));
gdrSupportMatrix[comm->cudaDev] = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
ncclNetProperties_t props;
NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
*gdrSupport = 1;
break;
gdrSupportMatrix[comm->cudaDev] = 1;
break;
#endif
// Allocate memory on the GPU and try to register it on the NIC.
void *lComm = NULL, *sComm = NULL, *rComm = NULL;
ncclNetHandle_t handle;
void* gpuPtr = NULL;
char* gpuPtr = NULL;
void* mHandle = NULL;
ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
bool connected;
connected = false;
@@ -341,32 +345,34 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
}
if (sComm == NULL)
NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2);
NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2);
if (rComm == NULL)
NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2);
NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2);
connected = (rComm != NULL) && (sComm != NULL);
}
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle));
*gdrSupport = 1;
NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
gdrSupportMatrix[comm->cudaDev] = 1;
}
ncclDebugNoWarn = 0;
CUDACHECK(cudaFree(gpuPtr));
NCCLCHECK(ncclCudaFree(gpuPtr));
cleanup2:
if (rComm != NULL)
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
NCCLCHECK(comm->ncclNet->closeRecv(rComm));
if (sComm != NULL)
NCCLCHECK(ncclNetCloseSend(comm, sComm));
NCCLCHECK(ncclNetCloseListen(comm, lComm));
NCCLCHECK(comm->ncclNet->closeSend(sComm));
NCCLCHECK(comm->ncclNet->closeListen(lComm));
cleanup1:
break;
break;
}
}
*gdrSupport = gdrSupportMatrix[comm->cudaDev];
return ncclSuccess;
}
+369 -300
View File
File diff suppressed because it is too large Load Diff
+35 -24
View File
@@ -22,8 +22,8 @@ template <int type>
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) {
struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
comm->channels[channelId].peers[peer].recv + connIndex;
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer]->send + connIndex :
comm->channels[channelId].peers[peer]->recv + connIndex;
// handle intra-node network connections
int n1 = -1, n2 = -1;
if (connIndex == NCCL_CONN_IDX_P2P_NET) {
@@ -57,12 +57,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int n
uint64_t mask = 1UL << channel->id;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue;
comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue;
comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
}
return ncclSuccess;
@@ -85,7 +85,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
@@ -154,13 +154,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
for (int c=0; c<MAXCHANNELS; c++) {
TIME_START(3);
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
struct ncclDevChannelPeer* addr;
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
/* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
@@ -171,13 +174,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
// Start with recv channels
TIME_START(4);
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
struct ncclDevChannelPeer* addr;
conn->connected = 1;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
/* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
@@ -203,8 +209,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (highestTransportType != NULL) *highestTransportType = highestType;
TIME_PRINT("P2P Setup/Connect");
exit:
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
return ret;
fail:
goto exit;
@@ -238,7 +244,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
}
// select
struct ncclChannelPeer* root = channel->peers+nranks;
struct ncclChannelPeer* root = channel->peers[nranks];
// connector index: 0 for recv, 1 for send
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
@@ -277,8 +283,9 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
// connect
if (isMaster) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
struct ncclDevChannelPeer* devRoot;
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
@@ -317,16 +324,20 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
// Free collNet resources
for (int r=0; r<comm->nChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
recv->transportResources = NULL; // avoid double free
struct ncclChannelPeer* peer = channel->peers[comm->nRanks];
if (peer) {
if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
recv->transportResources = NULL; // avoid double free
}
}
}
}
return ncclSuccess;
+104 -95
View File
@@ -144,24 +144,26 @@ struct setupReq {
int netDev;
int useGdr;
int needFlush;
struct ncclCollNetSharedRes* collNet;
};
/* Setup send connector, and return connect information for others in the coll
* communicator to connect to me */
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req;
struct setupReq req = { 0 };
int proxyRank;
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
@@ -169,17 +171,22 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
}
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req;
struct setupReq req = { 0 };
int proxyRank;
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
@@ -224,7 +231,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map;
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError;
@@ -250,7 +257,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map;
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError;
@@ -279,7 +286,7 @@ static ncclResult_t recvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*)reqBuff;
if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
@@ -291,9 +298,10 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
resources->netDev = req->netDev;
resources->useGdr = req->useGdr;
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
connection->collNet = req->collNet;
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
return ncclSuccess;
}
@@ -303,19 +311,19 @@ struct sharedResources {
int commRefCount[NCCL_MAX_NETDEVS];
};
// sharedListen: make sure the shared CollNet resource table exists and, if no collective
// comm has been established yet for `netDev`, open a listen comm for that device.
// NOTE(review): this span appears to be a rendered diff -- each changed line is shown twice,
// the pre-change form (state reached through struct ncclComm*) immediately followed by the
// post-change form (state reached through ncclProxyState* / ncclCollNetSharedRes*).
//   netDev        - index into the per-device arrays of struct sharedResources.
//   collNetHandle - forwarded to the plugin's listen call (presumably filled in by it -- confirm).
// Returns ncclSuccess on the visible success path; failures go through NCCLCHECK.
ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, struct ncclCollNetSharedRes* collNet, void* collNetHandle) {
struct sharedResources* resources = (struct sharedResources*)collNet->resources;
// First caller allocates the table (ncclCalloc, one element) and publishes it
// so later calls find it through the same pointer.
if (resources == NULL) {
NCCLCHECK(ncclCalloc(&resources, 1));
comm->proxyState.progressState.collNet.resources = resources;
collNet->resources = resources;
}
// Only listen while there is no collective comm for this device; the listen comm
// is stored in collNetListenComms[netDev].
if (resources->collNetComms[netDev] == NULL)
NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev));
NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev));
return ncclSuccess;
}
static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
static ncclResult_t sharedConnect(struct ncclProxyState* proxyState, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclCollNetSharedRes* collNet, void** collNetComm) {
struct sharedResources* resources = (struct sharedResources*)collNet->resources;
if (resources->collNetComms[netDev] == NULL) {
// Connect to coll comm
collNetHandle_t** handlePtrs = NULL;
@@ -324,13 +332,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
handlePtrs[i] = &(info->collNetHandle);
}
ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank,
ncclResult_t ret = proxyState->ncclCollNet->connect((void**)handlePtrs, nranks, rank,
resources->collNetListenComms[netDev],
resources->collNetComms+netDev);
free(handlePtrs);
if (ret == ncclSuccess) {
// Close listen comm
NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev]));
NCCLCHECK(proxyState->ncclCollNet->closeListen(resources->collNetListenComms[netDev]));
} else {
resources->collNetListenComms[netDev] = NULL;
}
@@ -340,55 +348,53 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
return ncclSuccess;
}
// sharedFree: drop one reference on the collective comm of `netDev`; close that comm when
// its count reaches zero, and free the whole sharedResources table once no device has a
// non-zero count left.
// NOTE(review): this span appears to be a rendered diff -- each changed line is shown twice,
// the pre-change form (taking struct ncclComm*) followed by the post-change form
// (taking ncclProxyState* and ncclCollNetSharedRes*).
// NOTE(review): `resources` is dereferenced without a NULL check -- confirm callers only
// reach this after the table has been allocated.
static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
static ncclResult_t sharedFree(struct ncclProxyState* proxyState, struct ncclCollNetSharedRes* collNet, int netDev) {
struct sharedResources* resources = (struct sharedResources*)collNet->resources;
resources->commRefCount[netDev]--;
// Last user of this device's collective comm: close it.
if (resources->commRefCount[netDev] == 0) {
NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev]));
NCCLCHECK(proxyState->ncclCollNet->closeColl(resources->collNetComms[netDev]));
}
// Keep the table alive while any device is still referenced.
for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
// Nothing references the table any more: unpublish the pointer before freeing it.
comm->proxyState.progressState.collNet.resources = NULL;
collNet->resources = NULL;
free(resources);
return ncclSuccess;
}
// sharedBuffersInit: lazily size and allocate the shared CollNet buffer, then return its
// address and size.
// NOTE(review): this span appears to be a rendered diff -- each changed line is shown twice,
// the pre-change form (state read from struct ncclComm*) followed by the post-change form
// (state read from struct ncclCollNetSharedRes*).
//   cuda   - non-zero selects the device buffer (cudaBuff), zero the host buffer (hostBuff).
//   gpuPtr - out: set to the selected buffer.
//   cpuPtr - out: set to the same address as *gpuPtr.
//   size   - out: total byte size of the shared buffer.
static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
if (state->size == 0) {
state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
// size == 0 means "not sized yet". The factor 2 presumably covers the separate send and
// recv pools addressed via `type` in sharedBuffersGet -- confirm.
if (collNet->size == 0) {
collNet->size = 2 * collNet->nChannels * collNet->buffSize;
}
*size = state->size;
*size = collNet->size;
// Each buffer is allocated at most once: only when requested and still NULL.
if (cuda && state->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, comm->sideStream, cuda));
if (cuda && collNet->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda));
}
if (!cuda && state->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
if (!cuda && collNet->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
}
// Both out-pointers receive the same address, for the device and the host case alike.
*gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
*gpuPtr = *cpuPtr = cuda ? collNet->cudaBuff : collNet->hostBuff;
return ncclSuccess;
}
// sharedBuffersGet: compute the byte offset of one slot inside the shared buffer.
// NOTE(review): this span appears to be a rendered diff -- each changed line is shown twice,
// the pre-change form (sizes read from struct ncclComm*) followed by the post-change form
// (sizes read from struct ncclCollNetSharedRes*).
//   type    - pool selector; callers visible in this file pass 0 from sendProxyProgress
//             and 1 from recvProxyProgress.
//   slot    - step slot; callers pass a value taken modulo NCCL_STEPS.
//   channel - channel index within the pool.
//   offset  - out: slotSize * ((type*NCCL_STEPS + slot)*nChannels + channel).
// Always returns ncclSuccess; no range checking is done on the inputs.
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) {
static ncclResult_t sharedBuffersGet(struct ncclCollNetSharedRes* collNet, int type, int slot, int channel, int* offset) {
// Use different pools for different channels and also separate send/recv.
int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
int slotSize = collNet->buffSize / NCCL_STEPS;
int globalSlot = (type * NCCL_STEPS + slot) * collNet->nChannels + channel;
*offset = slotSize * globalSlot;
return ncclSuccess;
}
// sharedBuffersDestroy: release the shared device and host buffers. A size of 0 is used as
// the "nothing to free / already freed" marker, so repeated calls are no-ops.
// NOTE(review): this span appears to be a rendered diff -- each changed line is shown twice,
// the pre-change form (struct ncclComm*, cudaFree via CUDACHECK) followed by the post-change
// form (struct ncclCollNetSharedRes*, ncclCudaFree via NCCLCHECK).
// NOTE(review): cudaBuff/hostBuff are not reset to NULL after being freed, while
// sharedBuffersInit only allocates when they are NULL -- confirm the buffers are never
// re-initialised after a destroy, otherwise stale pointers would be handed out.
static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) {
struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
if (state->size == 0) return ncclSuccess;
CUDACHECK(cudaFree(state->cudaBuff));
NCCLCHECK(ncclCudaHostFree(state->hostBuff));
static ncclResult_t sharedBuffersDestroy(struct ncclCollNetSharedRes* collNet) {
if (collNet->size == 0) return ncclSuccess;
// Both buffers are released unconditionally; either may never have been allocated
// (presumably the free helpers accept NULL -- confirm).
NCCLCHECK(ncclCudaFree(collNet->cudaBuff));
NCCLCHECK(ncclCudaHostFree(collNet->hostBuff));
// This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once.
state->size = 0;
collNet->size = 0;
return ncclSuccess;
}
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*)reqBuff;
if (reqSize != sizeof (struct setupReq)) return ncclInternalError;
@@ -401,18 +407,19 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
resources->useGdr = req->useGdr;
resources->needFlush = req->needFlush;
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
connection->collNet = req->collNet;
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
NCCLCHECK(sharedListen(comm, req->netDev, netHandle));
NCCLCHECK(sharedListen(proxyState, req->netDev, req->collNet, netHandle));
return ncclSuccess;
}
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
@@ -426,7 +433,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
resources->recvMhandles[p] = info->mhandles[p];
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
@@ -434,7 +441,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
*((struct connectMap**)respBuff) = NULL;
return ncclSuccess;
}
connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev;
connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev;
struct connectMap* map = &resources->map;
@@ -445,7 +452,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr));
resources->gdcSync = cpuPtr;
struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -462,7 +469,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
// Allocate & Register shared buffers for the Simple protocol
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
#if CUDA_VERSION >= 11070
@@ -470,23 +477,23 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (resources->useGdr && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
}
*((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
}
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
@@ -494,7 +501,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
resources->collNetRank = args->rank;
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
@@ -502,7 +509,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
*((struct connectMap**)respBuff) = NULL;
return ncclSuccess;
}
connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1;
connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev + 1;
struct connectMap* map = &resources->map;
@@ -513,7 +520,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
if (ncclGdrCopy) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr));
if (ncclParamGdrCopySyncEnable()) {
resources->gdcSync = cpuPtr;
@@ -531,7 +538,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
// Allocate & Register shared buffers for the Simple protocol
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
#if CUDA_VERSION >= 11070
@@ -539,16 +546,16 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (resources->useGdr && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
}
// Pass info to send side
@@ -561,41 +568,43 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
// sendProxyFree: tear down the send-side proxy resources attached to `connection`.
// Does nothing (and returns ncclSuccess) when connection->transportResources is NULL.
// NOTE(review): this span appears to be a rendered diff -- each changed line is shown twice,
// the pre-change form (taking struct ncclComm*) followed by the post-change form
// (taking struct ncclProxyState* and using connection->collNet).
// NOTE(review): every step is wrapped in NCCLCHECK/CUDACHECK; presumably a failure returns
// early and skips the remaining frees -- confirm that is acceptable on this path.
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (resources) {
// Deregister every memory handle that was registered for sending.
for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
if (resources->sendMhandles[p]) {
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->sendMhandles[p]));
}
}
// Free the per-connection host and device banks, plus the GDR mapping if one was set up.
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
// Release the shared buffers and this connection's reference on the device's collective comm.
NCCLCHECK(sharedBuffersDestroy(comm));
NCCLCHECK(sharedFree(comm, resources->netDev));
NCCLCHECK(sharedBuffersDestroy(connection->collNet));
NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev));
// Drop this connection's reference on the shared CollNet state; free it on the last one
// (assumes ncclAtomicRefCountDecrement returns the post-decrement count -- confirm).
if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet);
free(connection->transportResources);
}
return ncclSuccess;
}
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
if (resources) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->mhandles[p]) {
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
NCCLCHECK(sharedBuffersDestroy(comm));
NCCLCHECK(sharedFree(comm, resources->netDev));
NCCLCHECK(sharedBuffersDestroy(connection->collNet));
NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev));
if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet);
free(connection->transportResources);
}
return ncclSuccess;
@@ -605,7 +614,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
#define LAST_OF_GROUP(s) \
(s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@@ -633,7 +642,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
int sharedBuffSlot = sub->posted%NCCL_STEPS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
__sync_synchronize();
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@@ -654,7 +663,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int ready = 1;
if (s == 0) {
int offset;
NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
args->sharedBuff[sharedBuffSlot] = localBuff + offset;
args->sharedSize[sharedBuffSlot] = args->chunkSize;
}
@@ -680,7 +689,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] == NULL) continue;
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
@@ -696,7 +705,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int done, size;
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size));
NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
@@ -720,7 +729,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
return ncclSuccess;
}
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@@ -751,7 +760,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sharedBuffSlot = sub->posted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
reqFifo[group][buffSlot].recvBuff = localBuff + offset;
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
sub->posted += args->sliceSteps;
@@ -782,8 +791,8 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
} else {
int startChannel = group*COLLNET_GROUP_NSUBS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
}
} else {
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@@ -797,7 +806,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
int done = 1;
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL));
if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@@ -811,7 +820,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize;
__sync_synchronize();
+228 -189
View File
@@ -13,6 +13,7 @@
#include "collectives.h"
#include "gdrwrap.h"
#include "shm.h"
#include "p2p.h"
#include "profiler.h"
#include "graph.h"
#include "graph/topo.h"
@@ -67,10 +68,8 @@ struct connectMapMem{
char* gpuPtr;
char* cpuPtr;
int size;
union {
char shmPath[PATH_MAX];
cudaIpcMemHandle_t ipc;
};
ncclIpcDesc ipcDesc;
char shmPath[PATH_MAX];
ncclShmHandle_t attachHandle;
ncclShmHandle_t createHandle;
};
@@ -95,9 +94,9 @@ struct sendResources {
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
int rank;
int localRank;
int remoteRank;
int tpRank;
int tpLocalRank;
int tpRemoteRank;
int netDev;
int useGdr;
int useDmaBuf;
@@ -122,10 +121,10 @@ struct recvResources {
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
int rank;
int localRank;
int remoteRank;
int proxyRank;
int tpRank;
int tpLocalRank;
int tpRemoteRank;
int tpRemoteProxyRank;
int netDev;
int useGdr;
int useDmaBuf;
@@ -162,9 +161,9 @@ NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
struct setupReq {
int rank;
int localRank;
int remoteRank;
int tpRank;
int tpLocalRank;
int tpRemoteRank;
int shared;
int netDev;
int useGdr;
@@ -177,7 +176,8 @@ struct setupReq {
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req;
struct setupReq req = { 0 };
int localRank, tpProxyRank;
send->conn.shared = req.shared = (graph || mscclAvailable() && mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
@@ -195,20 +195,22 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
send->conn.curr_hdp_reg = req.curr_hdp_reg;
}
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
tpProxyRank = comm->topParentRanks[proxyRank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
} else {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
}
*((int*)connectInfo) = proxyRank;
*((int*)connectInfo) = tpProxyRank;
return ncclSuccess;
}
@@ -219,7 +221,8 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
/* Setup recv connector */
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req;
struct setupReq req = { 0 };
int localRank;
recv->conn.shared = req.shared = (graph || mscclAvailable() && mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
@@ -227,7 +230,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
req.netDev = -1;
// Use myInfo->rank as the receiver uses its own NIC
int proxyRank = myInfo->rank;
int proxyRank = myInfo->rank, tpProxyRank;
if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &req.netDev));
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
@@ -236,13 +239,15 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
// We don't support PXN on receive yet
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
req.tpLocalRank = comm->topParentLocalRanks[localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, comm->ncclNet->name, req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
return ncclSuccess;
}
@@ -297,39 +302,47 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
send->transportResources = map;
opId = send;
INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
} else {
opId = send;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId));
NCCLCHECK(ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId);
if (map->sameProcess) {
if (map->sameProcess && !ncclCuMemEnable()) {
if (map->cudaDev != comm->cudaDev) {
// Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
if (!ncclCuMemEnable()) {
// Enable P2P access for Legacy IPC
cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
}
}
} else {
NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
CUDACHECK(cudaIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
map->mems[NCCL_NET_MAP_DEVMEM].size,
&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL;
}
if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) {
void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank;
void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank;
if (*sharedDevMemPtr == NULL) {
CUDACHECK(cudaIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
&map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
sharedDevMemPtr));
}
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr);
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL;
@@ -363,13 +376,13 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
opId = recv;
INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
opId, &recv->proxyConn, connectInfo);
NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
} else {
opId = recv;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId));
NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
@@ -394,10 +407,24 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
static ncclResult_t sendFree(struct ncclConnector* send) {
struct connectMap* map = (struct connectMap*)(send->transportResources);
if (map) {
if (map->sameProcess == 0) {
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
if (map->sameProcess && map->cudaDev == cudaDev) {
// Our own GPU, so it wasn't mapped in
free(map);
return ncclSuccess;
}
if (!map->sameProcess || ncclCuMemEnable()) {
if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
if (ncclCuMemEnable()) {
// cuMem API support
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
// Legacy CUDA IPC support
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
}
}
free(map);
@@ -412,86 +439,87 @@ static ncclResult_t recvFree(struct ncclConnector* recv) {
}
#define NCCL_SHARED_STEPS 16
static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess,
int nChannels, char** gpuPtr, char** cpuPtr, int* size, cudaIpcMemHandle_t* ipc) {
static ncclResult_t sharedBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess,
int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) {
if (cuda == 0 && sameProcess == 0) {
WARN("PXN should not use host buffers for data");
return ncclInternalError;
}
struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
struct ncclProxyProgressState* progressState = &proxyState->progressState;
if (progressState->localPeers == NULL) {
NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
}
struct ncclProxyPeer** localPeers = progressState->localPeers;
if (localPeers[localRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers+localRank, 1));
if (localPeers[tpLocalRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers + tpLocalRank, 1));
}
struct ncclProxyPeer* peer = localPeers[localRank];
struct ncclProxyPeer* peer = localPeers[tpLocalRank];
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
state->refcount++;
if (state->size == 0) {
state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pChunkSize;
state->size = nChannels * NCCL_SHARED_STEPS * proxyState->p2pChunkSize;
}
if (size) *size = state->size;
if (cuda && state->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, comm->sideStream, cuda));
if (sameProcess == 0) {
CUDACHECK(cudaIpcGetMemHandle(&state->ipc, state->cudaBuff));
if (sameProcess == 0 || ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
} else {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, cuda));
}
}
if (!cuda && state->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
}
if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
if (sameProcess) {
if (gpuPtr) *gpuPtr = *cpuPtr;
} else {
if (gpuPtr) *gpuPtr = NULL;
if (ipc) memcpy(ipc, &state->ipc, sizeof(cudaIpcMemHandle_t));
}
if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL;
if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc));
return ncclSuccess;
}
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) {
// Use different pools for different channels and also separate send/recv.
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
*offset = comm->p2pChunkSize * globalSlot;
*offset = proxyState->p2pChunkSize * globalSlot;
return ncclSuccess;
}
static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) {
if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank];
static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
if (peer == NULL) NCCLCHECK(ncclInternalError;)
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
if (state->size == 0) NCCLCHECK(ncclInternalError);
state->refcount--;
if (state->refcount == 0) {
if (state->cudaBuff) CUDACHECK(cudaFree(state->cudaBuff));
if (ncclAtomicRefCountDecrement(&state->refcount) == 0) {
if (state->cudaBuff) {
if (!connection->sameProcess || ncclCuMemEnable()) {
NCCLCHECK(ncclP2pFreeShareableBuffer(&state->ipcDesc));
}
NCCLCHECK(ncclCudaFree(state->cudaBuff));
}
if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff));
}
if (peer->send.refcount || peer->recv.refcount) return ncclSuccess;
free(peer);
comm->proxyState.progressState.localPeers[localRank] = NULL;
for (int r=0; r<comm->localRanks; r++) {
if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess;
proxyState->progressState.localPeers[tpLocalRank] = NULL;
for (int r = 0; r < proxyState->tpLocalnRanks; r++) {
if (proxyState->progressState.localPeers[r]) return ncclSuccess;
}
// All peers are freed, free array
free(comm->proxyState.progressState.localPeers);
comm->proxyState.progressState.localPeers = NULL;
free(proxyState->progressState.localPeers);
proxyState->progressState.localPeers = NULL;
return ncclSuccess;
}
static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
int rank = comm->localRankToRank[connection->localRank];
int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
NCCLCHECK(sharedBuffersInit(comm, comm->hasFineGrain, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) {
NCCLCHECK(sharedBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL));
return ncclSuccess;
}
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*) reqBuff;
if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
@@ -499,9 +527,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
NCCLCHECK(ncclCalloc(&resources, 1));
connection->transportResources = resources;
resources->rank = req->rank;
resources->localRank = req->localRank;
resources->remoteRank = req->remoteRank;
resources->tpRank = req->tpRank;
resources->tpLocalRank = req->tpLocalRank;
resources->tpRemoteRank = req->tpRemoteRank;
resources->netDev = req->netDev;
resources->shared = connection->shared = req->shared;
resources->useGdr = req->useGdr;
@@ -509,9 +537,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
resources->connIndex = req->connIndex;
resources->curr_hdp_reg = req->curr_hdp_reg;
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->maxRecvs = props.maxRecvs;
// We don't return any data
@@ -520,7 +548,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
return ncclSuccess;
}
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct setupReq* req = (struct setupReq*) reqBuff;
if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
@@ -528,9 +556,9 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
NCCLCHECK(ncclCalloc(&resources, 1));
connection->transportResources = resources;
resources->rank = req->rank;
resources->localRank = req->localRank;
resources->remoteRank = req->remoteRank;
resources->tpRank = req->tpRank;
resources->tpLocalRank = req->tpLocalRank;
resources->tpRemoteRank = req->tpRemoteRank;
resources->netDev = req->netDev;
resources->shared = connection->shared = req->shared;
resources->useGdr = req->useGdr;
@@ -538,50 +566,50 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
resources->channelId = req->channelId;
resources->connIndex = req->connIndex;
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
/* DMA-BUF support */
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
resources->maxRecvs = props.maxRecvs;
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
*done = 1;
return ncclSuccess;
}
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
ncclResult_t ret = ncclSuccess;
if (resources->shared) {
// Shared buffers
struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
struct ncclProxyProgressState* progressState = &proxyState->progressState;
if (progressState->localPeers == NULL) {
NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
}
struct ncclProxyPeer** localPeers = progressState->localPeers;
if (localPeers[resources->localRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
if (localPeers[resources->tpLocalRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1));
}
connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId;
connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->send.proxyAppend + resources->channelId;
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
// Connect or reuse connection for a netdev/remote rank.
if (progressState->netComms[resources->netDev] == NULL) {
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId);
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, comms->sendComm + resources->channelId);
resources->netSendComm = comms->sendComm[resources->channelId];
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
} else {
ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
}
} else {
// Connect to remote peer
ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
connection->proxyAppendPtr = &connection->proxyAppend;
}
@@ -594,28 +622,27 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
// Create structures
struct connectMap* map = &resources->map;
map->sameProcess =
comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
map->sameProcess = connection->sameProcess;
map->shared = resources->shared;
CUDACHECK(cudaGetDevice(&map->cudaDev));
if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]);
resources->buffSizes[p] = comm->buffSizes[p];
NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, proxyState->buffSizes[p], buffs[p]);
resources->buffSizes[p] = proxyState->buffSizes[p];
}
} else {
// Get shared buffers
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(
comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels,
&mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc));
proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels,
&mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc));
resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
if (comm->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
if (proxyState->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
}
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
@@ -626,15 +653,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (resources->shared == 0) {
if (!map->sameProcess) {
if (!map->sameProcess || ncclCuMemEnable()) {
ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
}
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, comm->sideStream, resources->useGdr));
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
}
if (!map->sameProcess) {
CUDACHECK(cudaIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
}
if (map->sameProcess) {
NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
@@ -644,7 +671,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
}
if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr));
resources->gdcSync = cpuPtr;
struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -669,24 +696,24 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#else
/* DMA-BUF support */
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
int dmabuf_fd;
uint64_t offset;
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
(const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
}
}
}
@@ -697,40 +724,40 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(int)) return ncclInternalError;
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
resources->proxyRank = *(int*)reqBuff;
resources->tpRemoteProxyRank = *(int*)reqBuff;
ncclResult_t ret = ncclSuccess;
// Finish connection establishment from remote peer
if (resources->shared) {
// Shared buffers
struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
struct ncclProxyProgressState* progressState = &proxyState->progressState;
if (progressState->localPeers == NULL) {
NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
}
struct ncclProxyPeer** localPeers = progressState->localPeers;
if (localPeers[resources->localRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
if (localPeers[resources->tpLocalRank] == NULL) {
NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1));
}
connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId;
connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->recv.proxyAppend + resources->channelId;
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
// Connect or reuse connection for a netdev/remote rank.
if (progressState->netComms[resources->netDev] == NULL) {
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId);
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank;
if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId);
resources->netRecvComm = comms->recvComm[resources->channelId];
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
} else {
ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
}
} else {
// Connect to remote peer
ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
connection->proxyAppendPtr = &connection->proxyAppend;
}
@@ -741,26 +768,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
}
*done = 1;
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm));
// Create structures
struct connectMap* map = &resources->map;
map->sameProcess =
comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
map->sameProcess = connection->sameProcess;
if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv
map->shared = resources->shared;
if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]);
resources->buffSizes[p] = comm->buffSizes[p];
NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, proxyState->buffSizes[p], buffs[p]);
resources->buffSizes[p] = proxyState->buffSizes[p];
}
} else {
// Get shared buffers
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(
comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels,
proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels,
&mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
@@ -769,14 +795,19 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
if (comm->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
if (proxyState->allocP2pNetLLBuffers) {
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
}
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (resources->shared == 0) {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, comm->sideStream, resources->useGdr));
if (ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
}
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
}
}
@@ -784,7 +815,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
if (ncclGdrCopy && map->sameProcess) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr));
if (ncclParamGdrCopySyncEnable()) {
resources->gdcSync = cpuPtr;
@@ -807,24 +838,24 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#else
/* DMA-BUF support */
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
int dmabuf_fd;
uint64_t offset;
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
(const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
}
}
}
@@ -835,17 +866,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (connection->state == connSharedInitialized) { // NVB Preconnect
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection));
return ncclSuccess;
}
if (connection->state == connConnected) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
@@ -854,19 +885,25 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
} else {
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle));
}
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (!resources->map.sameProcess || ncclCuMemEnable()) {
// cuMem API support
if (mems[NCCL_NET_MAP_DEVMEM].size) {
NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
}
}
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank;
comms->sendRefCount[resources->channelId]--;
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
}
} else {
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
}
}
@@ -874,37 +911,43 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
return ncclSuccess;
}
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
if (connection->state == connSharedInitialized) { // NVB Preconnect
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection));
return ncclSuccess;
}
if (connection->state == connConnected) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (!resources->map.sameProcess || ncclCuMemEnable()) {
// cuMem API support
if (mems[NCCL_NET_MAP_DEVMEM].size) {
NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
}
}
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank;
comms->recvRefCount[resources->channelId]--;
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
}
} else {
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
}
}
if (resources) free(resources);
return ncclSuccess;
}
@@ -915,12 +958,10 @@ static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to c
static int g_npkit_net_poll_cnt = 0;
#endif
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt++;
#endif
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@@ -952,7 +993,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (resources->shared) {
int sharedBuffSlot = sub->posted%maxDepth;
int offset;
NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
resources->recvMem->offsFifo[buffSlot] = offset;
__sync_synchronize();
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@@ -1010,7 +1051,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
*resources->curr_hdp_reg = 1;
}
// Data is ready, try to send.
NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
@@ -1044,7 +1085,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (sub->done < sub->transmitted) {
int done;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL));
NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL));
if (done) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
@@ -1086,12 +1127,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
return ncclSuccess;
}
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt++;
#endif
if (args->state == ncclProxyOpReady) {
// Initialize subs and group them by same recvComm.
void* recvComm;
@@ -1151,7 +1190,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (p == NCCL_PROTO_SIMPLE && resources->shared) {
int sharedBuffSlot = sub->posted%maxDepth;
int offset;
NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
offsFifo[buffSlot] = offset;
ptrs[subCount] = localBuff+offset;
@@ -1160,7 +1199,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
sizes[subCount] = stepSize*args->sliceSteps;
if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
tags[subCount] = resources->remoteRank;
tags[subCount] = resources->tpRemoteRank;
mhandles[subCount] = resources->mhandles[p];
subCount++;
}
@@ -1169,7 +1208,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
uint64_t step = subGroup->posted;
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
if (*requestPtr) {
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup+i;
@@ -1207,7 +1246,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sizes[NCCL_PROXY_MAX_SUBS];
void* mhandles[NCCL_PROXY_MAX_SUBS];
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
NCCLCHECK(proxyState->ncclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes));
if (done) {
int needFlush = 0;
int totalSize = 0;
@@ -1264,7 +1303,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
}
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
}
}
args->idle = 0;
@@ -1279,7 +1318,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
uint64_t step = subGroup->transmitted;
int done = 1;
void* request = subGroup->requests[step%NCCL_STEPS];
if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL));
if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL));
if (done) {
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup + i;
+59 -24
View File
@@ -106,6 +106,7 @@ static void* ncclIbAsyncThreadMain(void* args) {
}
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
// NCCL_IB_MERGE_VFS (default 1): when non-zero, ncclIbGetPciPath rewrites the
// trailing function digits of the PCI path to '0' so that virtual functions
// (VFs) resolve to the same PCI device as their parent.
NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1);
static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) {
char devicePath[PATH_MAX];
@@ -117,7 +118,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
// Merge multi-port NICs into the same PCI device
p[strlen(p)-1] = '0';
// Also merge virtual functions (VF) into the same device
p[strlen(p)-3] = '0';
if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0';
// And keep the real port aside (the ibv port is always 1 on recent cards)
*realPort = 0;
for (int d=0; d<ncclNIbDevs; d++) {
@@ -403,16 +404,25 @@ struct ncclIbHandle {
struct ncclIbCommStage stage; // Used by the other side when connecting
};
// Retain local and remote RoCE addresses for error logging
// One instance is embedded in each send/recv comm; requests posted on an
// Ethernet (RoCE) link keep a pointer to it so that ncclIbTest can print both
// GIDs when a completion comes back with an error.
struct ncclIbGidInfo {
uint8_t link_layer; // Link layer of the local port (copied from ibv_port_attr.link_layer)
union ibv_gid localGid; // GID queried from the local port with wrap_ibv_query_gid
union ibv_gid remoteGid; // Peer GID, rebuilt from the spn/iid fields of the received QP info
};
// Request type codes stored in ncclIbRequest::type.
#define NCCL_NET_IB_REQ_UNUSED 0
#define NCCL_NET_IB_REQ_SEND 1
#define NCCL_NET_IB_REQ_RECV 2
#define NCCL_NET_IB_REQ_FLUSH 3
// Human-readable names for the codes above, used in error messages.
// The table is indexed directly by the request type, so the entries must stay
// in the same order as the NCCL_NET_IB_REQ_* values.
const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" };
struct ncclIbRequest {
struct ncclIbVerbs* verbs;
int type;
int events;
struct ncclSocket* sock;
struct ncclIbGidInfo* gidInfo;
int nreqs;
union {
struct {
@@ -462,8 +472,10 @@ struct ncclIbSendComm {
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
int qpIndex;
struct ibv_mr* fifoMr;
int ar;
struct ncclIbGidInfo gidInfo;
};
// The SendFifo needs to be 32-byte aligned and each element needs
// to be a 32-byte multiple, so that an entry does not get split and
@@ -496,7 +508,9 @@ struct ncclIbRecvComm {
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
int qpIndex;
struct ncclIbGpuFlush gpuFlush;
struct ncclIbGidInfo gidInfo;
};
// Compile-time guard: remFifo must sit at a 32-byte-aligned offset inside
// ncclIbRecvComm. The diagnostic names the struct that is actually checked.
static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbRecvComm fifo must be 32-byte aligned");
@@ -682,15 +696,14 @@ ib_connect_check:
// RoCE support
qpInfo.lid = portAttr.lid;
qpInfo.link_layer = portAttr.link_layer;
qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer;
if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
for (int q=0; q<comm->nqps; q++)
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
} else { // RoCE
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
qpInfo.spn = gid.global.subnet_prefix;
qpInfo.iid = gid.global.interface_id;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid));
qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix;
qpInfo.iid = comm->gidInfo.localGid.global.interface_id;
for (int q=0; q<comm->nqps; q++)
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
}
@@ -716,6 +729,8 @@ ib_connect:
memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
comm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
@@ -777,6 +792,9 @@ ib_recv:
/* copy back the received info */
memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
rComm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
rComm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
// IB setup
struct ibv_context* ctx;
uint8_t ib_port;
@@ -784,8 +802,7 @@ ib_recv:
ib_port = ncclIbDevs[lComm->dev].port;
struct ibv_port_attr portAttr;
NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &rComm->gidInfo.localGid));
// QP Creation
NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs));
@@ -812,7 +829,8 @@ ib_recv:
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
// Allocate Flush dummy buffer for GPU Direct RDMA
rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
rComm->gpuFlush.enabled = ((ncclIbGdrSupport(lComm->dev) == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess)
&& (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0;
if (rComm->gpuFlush.enabled) {
NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem;
@@ -823,8 +841,8 @@ ib_recv:
localQpInfo.lid=portAttr.lid;
localQpInfo.link_layer=portAttr.link_layer;
localQpInfo.ib_port=ib_port;
localQpInfo.spn=gid.global.subnet_prefix;
localQpInfo.iid=gid.global.interface_id;
localQpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
localQpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
localQpInfo.mtu=portAttr.active_mtu;
NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, rComm->gpuFlush.qp->qp_num, &localQpInfo));
NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp));
@@ -833,11 +851,11 @@ ib_recv:
// Fill Handle
struct ncclIbQpInfo qpInfo;
qpInfo.lid=portAttr.lid;
qpInfo.link_layer=portAttr.link_layer;
qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer;
qpInfo.ib_port=ib_port;
for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num;
qpInfo.spn=gid.global.subnet_prefix;
qpInfo.iid=gid.global.interface_id;
qpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
qpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
qpInfo.mtu=remQpInfo.mtu;
stage->state = ncclIbCommStateSend;
@@ -875,6 +893,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
r->verbs = verbs;
r->events = 1;
r->sock = NULL;
r->gidInfo = NULL;
*req = r;
return ncclSuccess;
}
@@ -979,6 +998,8 @@ returning:
return res;
}
// NCCL_IB_SPLIT_DATA_ON_QPS (default 1): when non-zero, each message is split
// into 128B-aligned chunks posted across all of the comm's QPs. When zero, a
// message is posted on a single QP and comm->qpIndex rotates to the next QP
// for the following message; the request then expects one completion event
// instead of one per QP.
NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1);
ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
struct ncclIbRequest** reqs = comm->fifoReqs[slot];
volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
@@ -1034,9 +1055,10 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
// Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work
const int align = 128;
for (int q=0; q<comm->nqps; q++) {
const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
for (int q=0; q<nqps; q++) {
for (int r=0; r<nreqs; r++) {
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
if (length <= 0) {
comm->wrs[r].sg_list = NULL;
@@ -1048,10 +1070,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
}
}
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr));
NCCLCHECK(wrap_ibv_post_send(comm->qps[comm->qpIndex], comm->wrs, &bad_wr));
comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
for (int r=0; r<nreqs; r++) {
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
reqs[r]->send.offset += chunkSize;
comm->sges[r].addr += chunkSize;
comm->wrs[r].wr.rdma.remote_addr += chunkSize;
@@ -1111,7 +1134,8 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
req->send.data = data;
req->send.lkey = mr->lkey;
req->send.offset = 0;
req->events = comm->nqps;
req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
*request = reqs[r] = req;
// If this is a multi-recv, send only when all requests have matched.
@@ -1205,6 +1229,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
req->type = NCCL_NET_IB_REQ_RECV;
req->sock = &comm->sock;
req->nreqs = n;
if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
struct ibv_recv_wr wr;
@@ -1215,13 +1240,15 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
wr.num_sge = 0;
TIME_START(1);
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
for (int q=0; q<nqps; q++) {
struct ibv_qp* qp = comm->qps[comm->qpIndex];
struct ibv_recv_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr));
comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
}
TIME_STOP(1);
req->events = comm->nqps;
req->events = nqps;
*request = req;
@@ -1292,8 +1319,16 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
char line[SOCKET_NAME_MAXLEN+1];
union ncclSocketAddress addr;
ncclSocketGetAddr(r->sock, &addr);
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
char localGidString[INET6_ADDRSTRLEN] = "";
char remoteGidString[INET6_ADDRSTRLEN] = "";
const char* localGidStr = NULL, *remoteGidStr = NULL;
if (r->gidInfo) {
localGidStr = inet_ntop(AF_INET6, &r->gidInfo->localGid, localGidString, sizeof(localGidString));
remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString));
}
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d (%s)%s%s%s%s",
ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type],
localGidStr ? " localGid ":"", localGidString, remoteGidStr ? " remoteGid ":"", remoteGidString);
return ncclRemoteError;
}
+181 -136
View File
@@ -43,22 +43,7 @@ struct ncclTransport nvlsTransport = {
{ NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
};
#define NVLS_HANDLE_SIZE 64
struct nvlsResources {
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
};
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) {
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) {
CUmulticastObjectProp* prop = &resources->properties;
memset(prop, 0, sizeof(*prop));
prop->size = size;
@@ -81,7 +66,7 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* reso
return ncclSuccess;
}
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) {
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) {
size_t size = resources->size;
// Create a Multicast group
@@ -103,24 +88,13 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resour
return ncclSuccess;
}
ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) {
ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev);
CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) {
int dev = resources->dev;
size_t size = resources->size;
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
return ncclSuccess;
}
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) {
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, char* shareableHandle) {
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
@@ -131,9 +105,11 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resou
int fd = *(int *)shareableHandle;
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
struct ncclProxyConnector proxyConn;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn));
int tpProxyRank = comm->topParentRanks[rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn));
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int)));
NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, (int *)shareableHandle));
fd = *(int *)shareableHandle;
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
} else {
@@ -146,7 +122,20 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resou
return ncclSuccess;
}
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) {
// Release what the connection to the NVLS multicast group left open.
// Only the POSIX file-descriptor handle type needs work here: the int stored
// in resources->shareableHandle is read as a file descriptor and closed.
// The result of close() is deliberately ignored. `comm` is unused.
// Always returns ncclSuccess.
ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
  const CUmemAllocationHandleType handleType = NVLS_CU_MEM_HANDLE_TYPE;

  // Any other handle type has no descriptor to close.
  if (handleType != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
    return ncclSuccess;
  }

  // cuMem UDS support
  const int handleFd = *(int *)resources->shareableHandle;
  (void) close(handleFd);
  return ncclSuccess;
}
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
size_t size = resources->size;
size_t granularity;
CUdeviceptr ptr = 0;
@@ -178,7 +167,21 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resou
return ncclSuccess;
}
ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) {
// Unbind this device's memory from the NVLS multicast group, then release the
// group connection through nvlsGroupDisconnect.
// Uses resources->mcHandle, resources->dev and resources->size; the unbind
// covers `size` bytes starting at multicast offset 0.
// Returns ncclSuccess, or whatever error CUCHECK/NCCLCHECK propagate.
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
  const int device = resources->dev;
  const size_t bytes = resources->size;

  INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, bytes, device);

  // Detach the physical memory this device had bound to the group.
  CUCHECK(cuMulticastUnbind(resources->mcHandle, device, 0/*mcOffset*/, bytes));

  // Then let go of the multicast group itself.
  NCCLCHECK(nvlsGroupDisconnect(comm, resources));

  return ncclSuccess;
}
ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
size_t size = resources->size;
CUdeviceptr ptr = 0;
@@ -196,7 +199,7 @@ ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resour
return ncclSuccess;
}
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) {
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
size_t size;
CUdeviceptr ptr;
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)",
@@ -224,135 +227,172 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* reso
#define NVLS_MEM_ALIGN_SIZE (1 << 21)
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2);
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1);
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
comm->nvlsSupport = 0;
comm->nvlsChannels = 0;
int gpuCount;
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes>1) return ncclSuccess;
CUdevice dev;
int driverVersion;
if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
CUCHECK(cuCtxGetDevice(&dev));
CUDACHECK(cudaDriverGetVersion(&driverVersion));
comm->nvlsSupport = 0;
// NVLS Multicast support requires CUDA12.1 UMD + KMD
if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) {
CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
}
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 0) return ncclSuccess;
int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels()));
int rank = comm->localRank, nranks = comm->localRanks;
for (int c=0; c<nChannels; c++) {
NCCLCHECK(initChannel(comm, c));
}
ncclResult_t res = ncclSuccess;
struct nvlsResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
comm->nvlsResources = resources;
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize);
size_t nvlsTotalSize = nvlsPerRankSize*nranks;
INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* nvlsShareableHandle = NULL;
NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup);
if (rank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
if (ncclParamNvlsEnable() == 2) {
// NVLS Multicast support requires CUDA12.1 UMD + KMD
if (CUPFN(cuMulticastCreate) != NULL /*&& driverVersion >= 12010 */) {
CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
}
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup);
comm->nvlsSupport = 1;
}
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels()));
return ncclSuccess;
}
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nranks;
for (int i=0; i<NCCL_MAX_NVLS_ARITY; i++) channel->nvls.up[i] = -1;
channel->nvls.down = comm->nRanks+1+comm->localRank;
channel->nvls.out = -1; // Network not yet implemented.
channel->nvls.headRank = comm->localRank; // Network not yet implemented.
}
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess;
for (int r=0; r<nranks; r++) {
int nvlsPeer = comm->nRanks+1+r;
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.up[r] = nvlsPeer;
int nHeads = comm->channels[0].nvls.nHeads;
int headRank = comm->channels[0].nvls.headRank;
char* mem = NULL;
struct ncclChannelPeer* peer = channel->peers+nvlsPeer;
CUdevice dev;
CUCHECK(cuCtxGetDevice(&dev));
// Reduce UC -> MC
mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem+buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->recv[1].transportComm = &nvlsTransport.recv;
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL;
ncclResult_t res = ncclSuccess;
bool nvlsShare = true;
if (parent && parent->nvlsSupport && parent->config.splitShare && parent->localRanks == comm->localRanks)
nvlsShare = true;
else
nvlsShare = false;
// Broadcast MC -> UC
mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->send[1].transportComm = &nvlsTransport.send;
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem+buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL;
if (nvlsShare) {
/* reuse NVLS resources */
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
for (int c = 0; c < comm->nvlsChannels; c++) {
NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup);
}
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
comm->nvlsResources = parent->nvlsResources;
ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount);
} else {
int nChannels;
struct ncclNvlsSharedRes* resources;
/*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
nvlsPeer, c,
resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize),
resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/
NCCLCHECK(ncclCalloc(&resources, 1));
comm->nvlsResources = resources;
resources->refCount = 1;
if (parent && parent->config.splitShare) {
/* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels
* to make sure nvlsChannels match for each rank. */
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
}
nChannels = resources->nChannels = comm->nvlsChannels;
for (int c = 0; c < nChannels; c++) {
NCCLCHECK(initNvlsChannel(comm, c, parent, false));
}
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize);
size_t nvlsTotalSize = nvlsPerRankSize * nHeads;
INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* shareableHandle = resources->shareableHandle;
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup);
if (comm->localRank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, comm->localRank, comm->localRanks, shareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, comm->localRankToRank[0], shareableHandle), res, cleanup);
}
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
for (int h = 0; h < nHeads; h++) {
int nvlsPeer = comm->nRanks + 1 + h;
for (int c = 0; c < nChannels; c++) {
struct ncclChannel* channel = comm->channels + c;
char* mem = NULL;
struct ncclChannelPeer* peer = channel->peers[nvlsPeer];
// Reduce UC -> MC
mem = resources->ucBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
peer->send[1].transportComm = &nvlsTransport.send;
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem + buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem + buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL;
// Broadcast MC -> UC
mem = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
peer->recv[1].transportComm = &nvlsTransport.recv;
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem + buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem + buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;
struct ncclDevChannelPeer* addr;
CUDACHECKGOTO(cudaMemcpyAsync(&addr, comm->channels[c].devPeers + nvlsPeer, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
/*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
nvlsPeer, c,
resources->mcBuff + (h*2*nChannels+c)*(buffSize+memSize),
resources->mcBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize),
resources->ucBuff + (h*2*nChannels+c)*(buffSize+memSize),
resources->ucBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize));*/
}
}
}
free(nvlsShareableHandle);
return res;
cleanup:
comm->nvlsSupport = 0;
free(nvlsShareableHandle);
return res;
}
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources;
struct ncclNvlsSharedRes* resources = (struct ncclNvlsSharedRes*)comm->nvlsResources;
if (resources == NULL) return ncclSuccess;
NCCLCHECK(nvlsGroupUnbind(comm, resources));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
free(resources);
comm->nvlsResources = NULL;
if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) {
NCCLCHECK(nvlsGroupUnbind(comm, resources));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
free(resources);
comm->nvlsResources = NULL;
}
return ncclSuccess;
}
@@ -362,7 +402,12 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
* Pre CUDA 12.1 stubs
*/
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
comm->nvlsChannels = 0;
return ncclSuccess;
}
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
return ncclSuccess;
}
+280 -103
View File
@@ -11,17 +11,21 @@
#include "shm.h"
#include "graph.h"
#include "graph/topo.h"
#include "p2p.h"
// Kind of P2P connection, recorded in p2pResources::type by p2pSendSetup / p2pRecvSetup:
//   P2P_DIRECT       - same pidHash, direct pointers not disabled, no CE memcpy, cuMem disabled
//   P2P_INTERMEDIATE - the connection goes through an intermediate rank
//   P2P_IPC          - legacy CUDA IPC
//   P2P_CUMEM        - cuMem API (ncclCuMemEnable() is true)
enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM };
struct ncclP2pBuff {
void* directPtr;
cudaIpcMemHandle_t devIpc;
size_t size;
ncclIpcDesc ipcDesc;
};
struct p2pConnectInfo {
int rank;
int read;
struct ncclP2pBuff p2pBuff;
// Use by CE memcpy
// Used by CE memcpy
char shmName[7];
int shmSize;
};
@@ -31,7 +35,7 @@ struct p2pShm {
struct ncclSendMem sendMem;
struct ncclRecvMem recvMem;
};
struct p2pProxyInfo {
struct p2pShmProxyInfo {
// Shared memory between proxy and receiving GPU
struct p2pShm* shm;
struct p2pShm* devShm;
@@ -46,29 +50,33 @@ struct p2pProxyInfo {
// Receiver buffer
char* recvFifo;
// Used by progress only
// Used by CE memcpy progress only
uint64_t step;
cudaStream_t stream;
cudaEvent_t events[NCCL_STEPS];
};
static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
struct p2pSendResources {
struct ncclSendMem* devMem;
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
void* sendMemIpc;
void* recvMemIpc;
struct p2pProxyInfo proxyInfo;
};
struct p2pRecvResources {
struct ncclRecvMem* devMem;
// Per-connector transport resources shared by the send and the recv side of the
// P2P transport (stored in ncclConnector::transportResources by the setup functions).
struct p2pResources {
// Connection kind chosen in p2pSendSetup / p2pRecvSetup
enum p2pType type;
// Device memory obtained through p2pMap during setup: the send side fills
// sendDevMem, the recv side fills recvDevMem.
union {
struct ncclSendMem* sendDevMem;
struct ncclRecvMem* recvDevMem;
};
// Mapping pointers returned by p2pMap (left NULL when the direct pointer is
// used); released in p2pSendFree / p2pRecvFree.
void* sendMemIpc;
void* recvMemIpc;
// CE memcpy support
// Filled from the proxy setup response in p2pSendSetup when useMemcpy is set
struct p2pShmProxyInfo proxyInfo;
// Set in p2pRecvConnect when it attaches to the peer's SHM segment
struct p2pShm* shm;
struct p2pShm* devShm;
int shmSize;
ncclShmHandle_t handle;
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
};
// cuMem API support
// Proxy-side state kept in ncclProxyConnection::transportResources when
// ncclCuMemEnable() is true (see the proxy setup handlers in this file).
struct p2pCuMemProxyInfo {
// Copy of the ncclP2pBuff written into the setup response buffer
struct ncclP2pBuff p2pBuff;
};
#include <sys/types.h>
@@ -90,6 +98,7 @@ static int busIdToCudaDev(int64_t busId) {
return -1;
}
// CE memcpy support
NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
static int useMemcpy = 0;
static void initCeOperation();
@@ -149,14 +158,11 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
*ret = 0;
return ncclSuccess;
}
if (p2p == 0 && cudaDev1 == cudaDev2 && info1->busId == info2->busId) {
p2p = 1;
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
// Check that legacy IPC support is available
if (p2p != 0) {
// This will always fail when using NCCL_CUMEM_ENABLE=1
if (p2p != 0 && !ncclCuMemEnable()) {
// Cached result of the legacyIPC detection
static int legacyIPC = -1;
if (legacyIPC >= 0) {
@@ -166,12 +172,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Check that legacy IPC support is available (WSL WAR)
char *dummy;
cudaIpcMemHandle_t ipc;
NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN));
NCCLCHECK(ncclCudaMalloc(&dummy, CUDA_IPC_MIN));
if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported");
*ret = 0;
}
CUDACHECK(cudaFree(dummy));
NCCLCHECK(ncclCudaFree(dummy));
legacyIPC = *ret;
return ncclSuccess;
}
@@ -193,6 +199,98 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
} while (0)
// cuMem API support
/*
 * Allocate a device buffer and produce a descriptor for it in `ipcDesc`.
 *
 * With ncclCuMemEnable() the buffer is obtained from ncclCuMemAlloc() and the
 * descriptor (ipcDesc->cuDesc) from cuMemExportToShareableHandle(); this path is
 * only compiled for CUDART_VERSION >= 11030 and returns ncclInternalError
 * otherwise. Without cuMem the buffer is obtained from ncclCudaCalloc() and the
 * descriptor (ipcDesc->devIpc) from cudaIpcGetMemHandle().
 *
 * size    : size passed to the allocator
 * ipcDesc : out, descriptor for the allocated buffer
 * ptr     : out, device pointer of the allocated buffer
 *
 * Returns ncclSuccess on success; failures are reported through the
 * NCCLCHECK / CUCHECK / CUDACHECK macros.
 */
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
  if (!ncclCuMemEnable()) {
    // Legacy CUDA IPC: allocate a CUDA buffer and generate an IPC handle for it
    NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, true));
    cudaError_t ipcStatus = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr);
    if (ipcStatus != cudaSuccess) {
      WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(ipcStatus));
      // Release the buffer (result intentionally ignored) before reporting the CUDA error.
      // NOTE(review): *ptr keeps the freed address afterwards - confirm no caller reads it on error.
      ncclCudaFree(*ptr);
      CUDACHECK(ipcStatus);
    }
  } else {
#if CUDART_VERSION >= 11030
    // cuMem API support
    CUmemGenericAllocationHandle memHandle;
    CUmemAllocationHandleType handleType = NCCL_P2P_HANDLE_TYPE;
    NCCLCHECK(ncclCuMemAlloc(ptr, &memHandle, size));
    CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, memHandle, handleType, 0));
#else
    return ncclInternalError;
#endif
  }

  INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zi ipcDesc %p", *ptr, size, ipcDesc);
  return ncclSuccess;
}
/*
 * Release what ncclP2pAllocateShareableBuffer() exported into `ipcDesc`.
 *
 * Only acts when ncclCuMemEnable() is true and NCCL_P2P_HANDLE_TYPE is
 * CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: the file descriptor read from
 * ipcDesc->cuDesc.data is closed. In every other configuration the function
 * does nothing.
 *
 * Returns ncclInternalError when the stored descriptor is <= 0, ncclSuccess otherwise.
 */
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
  if (!ncclCuMemEnable()) return ncclSuccess;

  // cuMem API support
  CUmemAllocationHandleType handleType = NCCL_P2P_HANDLE_TYPE;
  if (handleType != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) return ncclSuccess;

  // The descriptor payload is read as an int file descriptor
  int exportedFd = *(int *) &ipcDesc->cuDesc.data;
  if (exportedFd <= 0) return ncclInternalError;

  // The result of close() is deliberately ignored
  (void) close(exportedFd);
  return ncclSuccess;
}
/*
 * Map a peer's shareable buffer, described by `ipcDesc`, into this process.
 *
 * Without cuMem the legacy CUDA IPC handle (ipcDesc->devIpc) is opened with
 * cudaIpcOpenMemHandle(). With cuMem (compiled only for CUDART_VERSION >= 11030,
 * ncclInternalError otherwise) the descriptor is imported, `size` bytes of
 * address space are reserved and mapped, and read/write access is granted to
 * device comm->cudaDev. For file-descriptor handles the peer's fd is first
 * converted through the proxy of rank `tpPeer`.
 *
 * comm      : communicator; provides cudaDev and the proxy connection
 * tpPeer    : rank passed to ncclProxyConnect() for the fd conversion
 * size      : size of the buffer to map
 * ipcDesc   : descriptor received from the peer
 * devMemPtr : out, local device pointer to the imported buffer
 *
 * Returns ncclSuccess on success; failures are reported through the
 * NCCLCHECK / CUCHECK / CUDACHECK macros.
 */
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
  if (!ncclCuMemEnable()) {
    // Legacy CUDA IPC
    CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess));
  } else {
#if CUDART_VERSION >= 11030
    // cuMem API support
    ncclCuDesc *remoteDesc = &ipcDesc->cuDesc;
    CUmemAllocationHandleType handleType = NCCL_P2P_HANDLE_TYPE;
    CUmemGenericAllocationHandle memHandle;
    CUdeviceptr mappedAddr = 0;

    // Import the remote memory descriptor
    if (handleType != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      CUCHECK(cuMemImportFromShareableHandle(&memHandle, remoteDesc, handleType));
    } else {
      // UDS fd support: have the peer's proxy convert its fd into one usable here
      struct ncclProxyConnector proxyConn;
      int remoteFd = *(int *)(&remoteDesc->data);
      int localFd = -1;

      NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn));
      NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, remoteFd, &localFd));
      INFO(NCCL_P2P, "UDS converted fd %d -> %d on peer %d", remoteFd, localFd, tpPeer);
      // NOTE(review): if CUCHECK returns early on failure, localFd is not closed - confirm.
      CUCHECK(cuMemImportFromShareableHandle(&memHandle, (void *)(uintptr_t)localFd, handleType));
      close(localFd);
    }

    // Reserve address space and map the imported allocation into it
    CUCHECK(cuMemAddressReserve(&mappedAddr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
    CUCHECK(cuMemMap(mappedAddr, size, /* offset */ 0, memHandle, /* flags */ 0));
    TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%lx dptr %p", size, (long)memHandle, (void*)mappedAddr);

    // Allow access by the local GPU
    CUmemAccessDesc accessDesc = {};
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    accessDesc.location.id = comm->cudaDev;
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    CUCHECK(cuMemSetAccess(mappedAddr, size, &accessDesc, 1));
    TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)mappedAddr, size, accessDesc.location.id);

    *devMemPtr = (void *)mappedAddr;
#else
    return ncclInternalError;
#endif
  }

  INFO(NCCL_P2P, "Imported shareable buffer device %d size %zi ptr %p", comm->cudaDev, size, *devMemPtr);
  return ncclSuccess;
}
// Setting this to non zero causes P2P to use Reads rather than Writes
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
@@ -209,10 +307,11 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
return ncclSuccess;
}
static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (myInfo->pidHash == peerInfo->pidHash) {
static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
if (peerInfo->cudaDev != myInfo->cudaDev) {
// Enable P2P access
// Same PID different GPUs, enable P2P access
// Legacy CUDA IPC
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
@@ -225,8 +324,15 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pBuff->devIpc, cudaIpcMemLazyEnablePeerAccess));
*ipcPtr = *devMem;
if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) {
// Same PID and GPU
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
// Different PID or different GPU
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
*ipcPtr = *devMem;
}
}
return ncclSuccess;
}
@@ -234,7 +340,8 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct p2pSendResources* resources;
struct p2pResources* resources;
int tpProxyRank;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int useRead, intermediateRank;
@@ -261,35 +368,47 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int sendSize = sizeof(struct ncclSendMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
if (info->read) sendSize += comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
resources->type = P2P_DIRECT;
send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
} else {
// cuMem API support
if (ncclCuMemEnable()) {
resources->type = P2P_CUMEM;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);;
} else {
// Legacy CUDA IPC
resources->type = P2P_IPC;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
}
send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
}
} else {
resources->type = P2P_INTERMEDIATE;
info->rank = intermediateRank;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
}
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn));
if (useMemcpy) {
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
} else {
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
}
return ncclSuccess;
@@ -298,7 +417,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
/* Create and return connect structures for this peer to connect to me */
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
struct p2pRecvResources* resources;
struct p2pResources* resources;
int tpProxyRank;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
int useRead, intermediateRank;
@@ -312,44 +432,56 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int recvSize = sizeof(struct ncclRecvMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
resources->type = P2P_DIRECT;
recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
if (ncclCuMemEnable()) {
// cuMem API support
resources->type = P2P_CUMEM;
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/CUMEM",
channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
} else {
// Legacy CUDA IPC
resources->type = P2P_IPC;
}
recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
}
} else {
resources->type = P2P_INTERMEDIATE;
info->rank = intermediateRank;
}
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
return ncclSuccess;
}
/* Connect/Send to this peer */
static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct ncclRecvMem* remDevMem;
struct p2pResources* resources = (struct p2pResources*)send->transportResources;
struct ncclRecvMem* remDevMem = NULL;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
char* buff = (char*)(remDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy
send->conn.buffs[p] = (char*)(resources->devMem+1);
if (resources->sendDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
send->conn.buffs[p] = (char*)(resources->sendDevMem+1);
} else {
send->conn.buffs[p] = buff;
buff += send->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
}
@@ -358,20 +490,20 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
// Send SIMPLE buff to proxy, and replace it by local buffer
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
} else {
send->conn.tail = &remDevMem->tail;
send->conn.head = &resources->devMem->head;
send->conn.ptrExchange = &resources->devMem->ptrExchange;
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
send->conn.head = &resources->sendDevMem->head;
send->conn.ptrExchange = &resources->sendDevMem->ptrExchange;
send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange;
}
return ncclSuccess;
}
/* Connect/Recv from this peer */
ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct p2pResources* resources = (struct p2pResources*)recv->transportResources;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
struct ncclSendMem* remDevMem = NULL;
@@ -381,20 +513,22 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
resources->shmSize = info->shmSize;
// Attach to peer's SHM segment
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
recv->conn.tail = &resources->devShm->recvMem.tail;
recv->conn.head = &resources->devShm->sendMem.head;
} else {
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
recv->conn.tail = &resources->devMem->tail;
struct ncclRecvMem* devMem = resources->recvDevMem;
recv->conn.tail = &devMem->tail;
recv->conn.head = &remDevMem->head;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
}
char* buff = (char*)(resources->devMem+1);
char* buff = (char*)(resources->recvDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
@@ -402,93 +536,113 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
recv->conn.buffs[p] = (char*)(remDevMem+1);
} else {
recv->conn.buffs[p] = buff;
buff += recv->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
}
return ncclSuccess;
}
ncclResult_t p2pSendFree(struct ncclConnector* send) {
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct p2pResources* resources = (struct p2pResources*)send->transportResources;
if (resources) {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
}
free(resources);
}
return ncclSuccess;
}
ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct p2pResources* resources = (struct p2pResources*)recv->transportResources;
if (resources) {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->handle));
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->handle));
}
}
free(resources);
}
return ncclSuccess;
}
static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (useMemcpy) {
struct p2pProxyInfo* proxyInfo;
// CE memcpy support
struct p2pShmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
connection->transportResources = proxyInfo;
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream, true));
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, true));
char shmPath[PATH_MAX];
shmPath[0] = '\0';
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
// Create a SHM segment for the peer to attach to
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo));
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
} else {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, comm->sideStream, true));
connection->transportResources = p2pBuff->directPtr;
cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
if (res != cudaSuccess) {
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
cudaFree(p2pBuff->directPtr);
free(p2pBuff);
CUDACHECK(res);
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
struct p2pCuMemProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff));
connection->transportResources = proxyInfo;
} else {
connection->transportResources = p2pBuff->directPtr;
}
}
*done = 1;
return ncclSuccess;
}
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, comm->sideStream, true));
connection->transportResources = p2pBuff->directPtr;
cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
if (res != cudaSuccess) {
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
cudaFree(p2pBuff->directPtr);
free(p2pBuff);
CUDACHECK(res);
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
struct p2pCuMemProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff));
connection->transportResources = proxyInfo;
} else {
connection->transportResources = p2pBuff->directPtr;
}
*done = 1;
return ncclSuccess;
}
static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
if (reqSize != sizeof(void*)) return ncclInternalError;
proxyInfo->recvFifo = *((char**)reqBuff);
@@ -501,13 +655,14 @@ static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection,
return ncclSuccess;
}
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
// CE memcpy support
if (useMemcpy) {
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
if (proxyInfo) {
NCCLCHECK(ncclShmClose(proxyInfo->handle));
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(proxyInfo->events[i]));
@@ -515,23 +670,45 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
free(proxyInfo);
}
} else {
// Do not check return code as CUDA may have already shut down
cudaFree(connection->transportResources);
if (ncclCuMemEnable()) {
// cuMem API support
struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources;
if (proxyInfo) {
struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff;
ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc);
ncclCudaFree(p2pBuff->directPtr);
free(proxyInfo);
}
} else {
// Do not check return code as CUDA may have already shut down
ncclCudaFree(connection->transportResources);
}
}
return ncclSuccess;
}
static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
// Do not check return code as CUDA may have already shut down
cudaFree(connection->transportResources);
// Proxy-side teardown of a P2P receive connection.
//
// connection : proxy connection whose transportResources is released.
// proxyState : unused here.
// Always returns ncclSuccess; the results of the individual free calls are
// intentionally not propagated.
static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
  if (!ncclCuMemEnable()) {
    // Without cuMem, transportResources is handed directly to ncclCudaFree.
    // Do not check return code as CUDA may have already shut down
    ncclCudaFree(connection->transportResources);
    return ncclSuccess;
  }

  // cuMem API support: transportResources is a host-side p2pCuMemProxyInfo
  // wrapping the shareable buffer descriptor and the device pointer.
  struct p2pCuMemProxyInfo* cuMemInfo = (struct p2pCuMemProxyInfo*)connection->transportResources;
  if (!cuMemInfo) return ncclSuccess;  // nothing was allocated for this connection

  ncclP2pFreeShareableBuffer(&cuMemInfo->p2pBuff.ipcDesc);
  ncclCudaFree(cuMemInfo->p2pBuff.directPtr);
  free(cuMemInfo);
  return ncclSuccess;
}
static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
// CE memcpy support
static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
// Round up to the next multiple of chunkSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
@@ -541,10 +718,10 @@ static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxy
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
resources->step = sub->base + sub->nsteps;
args->done++;
+21 -19
View File
@@ -85,7 +85,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclSendMem);
if (shmLocality == SHM_SEND_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
@@ -108,7 +108,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclRecvMem);
if (shmLocality == SHM_RECV_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
@@ -146,7 +146,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
send->conn.buffs[p] = buff;
buff += send->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
send->conn.tail = &resources->devRemHostMem->tail;
send->conn.head = &resources->devHostMem->head;
@@ -155,9 +155,11 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
}
if (useMemcpySend) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
int tpProxyRank;
tpProxyRank = comm->topParentRanks[comm->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
send->conn.tail = &proxyInfo.ceRecvMem->tail;
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
@@ -179,7 +181,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
recv->conn.buffs[p] = buff;
buff += recv->comm->buffSizes[p];
buff += comm->buffSizes[p];
}
recv->conn.head = &resources->devRemHostMem->head;
recv->conn.tail = &resources->devHostMem->tail;
@@ -187,7 +189,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
recv->conn.tail = &proxyInfo.ceRecvMem->tail;
}
@@ -214,12 +216,12 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream));
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
for (int i=0; i<NCCL_STEPS; i++) {
@@ -232,12 +234,12 @@ static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection,
return ncclSuccess;
}
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream));
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
for (int i=0; i<NCCL_STEPS; i++) {
@@ -250,12 +252,12 @@ static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection,
return ncclSuccess;
}
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
CUDACHECK(cudaFree(resources->devFifo));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
@@ -265,12 +267,12 @@ static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
CUDACHECK(cudaFree(resources->devFifo));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
@@ -280,7 +282,7 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@@ -294,7 +296,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
@@ -339,7 +341,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy
return ncclSuccess;
}
static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@@ -353,7 +355,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxy
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
+1 -1
View File
@@ -196,7 +196,7 @@ namespace RcclUnitTesting
if (useMultiRankPerGpu)
{
if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess)
//if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess)
{
ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId);
status = TEST_FAIL;
+1 -1
View File
@@ -6,7 +6,7 @@ endif
HIPCC = $(HIP_PATH)/bin/hipcc
EXE = topo_expl
CXXFLAGS = -g -O3 -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
@@ -69,7 +69,7 @@ public:
int rankToCudaDev(int rank) {
for (int i=0; i<getNumGpus(); i++) {
if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank[0])
if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank)
return systems[0]->nodes[GPU].nodes[i].gpu.dev;
}
return -1;
@@ -77,7 +77,7 @@ public:
int64_t getGpuBusId(int rank) {
for (int i=0; i<getNumGpus(); i++) {
if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank[0])
if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank)
return systems[0]->nodes[GPU].nodes[i].id;
}
return -1;
@@ -93,7 +93,7 @@ public:
void setRanks() {
for (int r=0; r<getNumGpus(); r++)
for (int i=0; i<getNumGpus(); i++)
systems[r]->nodes[GPU].nodes[i].gpu.rank[0] += firstRank;
systems[r]->nodes[GPU].nodes[i].gpu.rank += firstRank;
}
int p2pCanConnect(int device1, int device2) { return 1; }
@@ -133,4 +133,4 @@ public:
NetworkModel() : nRanks(0) {}
};
#endif
#endif
+75 -30
View File
@@ -1,6 +1,7 @@
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,24 +13,25 @@
#include <hip/hip_fp16.h>
#define NCCL_MAJOR 2
#define NCCL_MINOR 14
#define NCCL_PATCH 3
#define NCCL_MINOR 18
#define NCCL_PATCH 1
#define NCCL_SUFFIX ""
#define NCCL_VERSION_CODE 21403
#define NCCL_VERSION_CODE 21801
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1
#define RCCL_MULTIRANKPERGPU 1
#ifdef __cplusplus
extern "C" {
#endif
/*! @brief Opaque handle to communicator */
#include <limits.h>
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
@@ -45,15 +47,24 @@ typedef enum { ncclSuccess = 0,
ncclInProgress = 7,
ncclNumResults = 8 } ncclResult_t;
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1
/* Communicator configuration. Users can assign value to attributes to specify the
* behavior of a communicator. */
typedef struct ncclConfig_v21400 {
typedef struct ncclConfig_v21700 {
/* attributes that users should never touch. */
size_t size;
unsigned int magic;
unsigned int version;
/* attributes that users are able to customize. */
int blocking;
int cgaClusterSize;
int minCTAs;
int maxCTAs;
const char *netName;
int splitShare;
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
@@ -62,7 +73,12 @@ typedef struct ncclConfig_v21400 {
sizeof(ncclConfig_t), /* size */ \
0xcafebeef, /* magic */ \
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
1 /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
}
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
@@ -117,28 +133,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/// @endcond
/*! @brief Creates a new communicator (multi thread/process version) allowing multiple ranks per device.
@details
rank must be between 0 and nranks-1 and unique within a communicator clique.
Each rank is associated to a HIP device, which has to be set before calling
ncclCommInitRankMulti.
Since this version of the function allows multiple ranks to utilize the same
HIP device, a unique virtualId per device has to be provided by each calling
rank.
ncclCommInitRankMulti implicitly synchronizes with other ranks, so it must be
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
@param[in]
comm ncclComm_t*
communicator struct pointer
*/
ncclResult_t ncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
/// @cond include_hidden
ncclResult_t pncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
/// @endcond
/*! @brief Creates a clique of communicators (single process version).
*
* @details This is a convenience function to create a single-process communicator clique.
@@ -177,6 +171,19 @@ ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
/// @endcond
/*! @brief Creates one or more communicators from an existing one.
* Ranks with the same color will end up in the same communicator.
* Within the new communicator, key will be used to order ranks.
* NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
* and will therefore return a NULL communicator.
* If config is NULL, the new communicator will inherit the original communicator's
* configuration*/
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/// @cond include_hidden
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/// @endcond
/* Returns a string for each error code. */
/*! @brief Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
/// @cond include_hidden
@@ -188,7 +195,7 @@ const char* pncclGetErrorString(ncclResult_t result);
*/
const char* ncclGetLastError(ncclComm_t comm);
/// @cond include_hidden
const char* pncclGetError(ncclComm_t comm);
const char* pncclGetLastError(ncclComm_t comm);
/// @endcond
/* Checks whether the comm has encountered any asynchronous errors */
@@ -498,6 +505,44 @@ ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief Opaque handle to MSCCL algorithm */
typedef int mscclAlgoHandle_t;
/*! @brief MSCCL Load Algorithm
*
* @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
* its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
* scheduler instead of end users.
*/
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
/*! @brief MSCCL Run Algorithm
*
* @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
* list merges all possible parameters required by different operations as this
* is a general-purpose API. This API is expected to be called by MSCCL
* scheduler instead of end users.
*/
ncclResult_t mscclRunAlgo(
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
ncclResult_t pmscclRunAlgo(
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
/*! @brief MSCCL Unload Algorithm
*
* @details Unload a previously loaded MSCCL algorithm using its handle. This API
* is expected to be called by MSCCL scheduler instead of end users.
*/
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*
* Group semantics
*
+8 -13
View File
@@ -8,8 +8,7 @@
#ifndef UTILS_H_
#define UTILS_H_
// AllGather3 - begin
struct ncclGraphInfo {
struct graphInfo {
int pattern;
int nChannels;
int sameChannels;
@@ -19,14 +18,10 @@ struct ncclGraphInfo {
int typeInter;
};
struct allGather3Data_t{
int netDev;
int collNetSupport;
int nc;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
struct ncclGraphInfo collNet;
struct allGatherInfo {
struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
struct ncclTopoRanks topoRanks;
int nc;
bool pivotA2AEnabled;
bool ll128Enabled;
bool mscclEnabled;
@@ -40,11 +35,11 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash);
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph, struct ncclComm* parent = NULL);
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph);
#define TIME_START(index)
+3 -3
View File
@@ -216,10 +216,10 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
} else {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
}
*((int*)connectInfo) = proxyRank;
@@ -242,7 +242,7 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, comm->ncclNet->name, req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
return ncclSuccess;
}
+30 -6
View File
@@ -153,10 +153,15 @@ NodeModelDesc model_descs[] = {
{2, "topo_8p1h_5.xml", "2 nodes 8P1H Alt."},
};
NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);
NCCL_PARAM(MinCTAs, "MIN_CTAS", 1);
int main(int argc,char* argv[])
{
struct ncclComm *comm;
const int num_models = sizeof(model_descs) / sizeof(*model_descs);
int minCTAsEnv;
int maxCTAsEnv;
if (!cmdOptionExists(argv, argv + argc, "-m")) {
printf("Usage: ./topo_expl -m model_id\n");
@@ -200,18 +205,22 @@ int main(int argc,char* argv[])
node_model->rankToCudaDev(i), node_model->getGpuBusId(i));
}
minCTAsEnv = ncclParamMinCTAs();
maxCTAsEnv = ncclParamMaxCTAs();
NCCLCHECK(ncclCalloc(&comm, nranks));
struct ncclPeerInfo *peerInfo;
NCCLCHECK(ncclCalloc(&peerInfo, nranks+1)); // Extra rank to represent CollNet root
struct allGather3Data_t *allGather3Data;
struct allGatherInfo* allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph;
struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph, *nvlsGraph;
NCCLCHECK(ncclCalloc(&treeGraph, nranks));
NCCLCHECK(ncclCalloc(&ringGraph, nranks));
NCCLCHECK(ncclCalloc(&collNetGraph, nranks));
NCCLCHECK(ncclCalloc(&nvlsGraph, nranks));
for (int i = 0; i < nranks; i++) {
comm[i].rank = i;
@@ -224,8 +233,23 @@ int main(int argc,char* argv[])
comm[i].topo = node_model->getSystem(i);
comm[i].peerInfo = peerInfo;
comm[i].ncclNet = ncclNet;
comm[i].virtualId = -1;
// Mark channels as non initialized.
comm[i].config.maxCTAs = maxCTAsEnv;
comm[i].config.minCTAs = minCTAsEnv;
if (comm[i].topParentRanks == NULL) {
NCCLCHECK(ncclCalloc(&comm[i].topParentRanks, comm->nRanks));
for (int j = 0; j < comm->nRanks; ++j)
comm[i].topParentRanks[j] = j;
}
struct ncclSharedResources* sharedRes = NULL;
NCCLCHECK(ncclCalloc(&sharedRes, 1));
/* Most attributes are assigned later in initTransportsRank(). */
sharedRes->owner = &comm[i];
sharedRes->tpNRanks = comm[i].nRanks;
NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm[i].nRanks));
comm[i].sharedRes = sharedRes;
sharedRes->refCount = 1;
ncclMemoryStackConstruct(&comm[i].memPermanent);
// Mark channels as non initialized.
for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
}
@@ -233,13 +257,13 @@ int main(int argc,char* argv[])
for (int i = 0; i < nranks; i++) {
node_model = network.GetNode(i);
assert(node_model!=0);
initTransportsRank_1(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]);
initTransportsRank_1(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
}
for (int i = 0; i < nranks; i++) {
node_model = network.GetNode(i);
assert(node_model!=0);
initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]);
initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
}
for (uint64_t len = 8; len <= 4294967296L; len *= 2) {
File diff suppressed because it is too large Load Diff