Merge remote-tracking branch 'nccl/master' into develop
Этот коммит содержится в:
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 13
|
||||
NCCL_PATCH := 4
|
||||
NCCL_MINOR := 14
|
||||
NCCL_PATCH := 3
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+7
-1
@@ -9,7 +9,7 @@ include ../makefiles/version.mk
|
||||
|
||||
##### src files
|
||||
INCEXPORTS := nccl.h nccl_net.h
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
|
||||
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
|
||||
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
|
||||
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
|
||||
@@ -29,6 +29,12 @@ OBJDIR := $(BUILDDIR)/obj
|
||||
PKGDIR := $(BUILDDIR)/lib/pkgconfig
|
||||
##### target files
|
||||
CUDARTLIB ?= cudart_static
|
||||
|
||||
ifeq ($(CUDARTLIB), cudart_static)
|
||||
# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658
|
||||
LIBSRCFILES += enhcompat.cc
|
||||
endif
|
||||
|
||||
INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
|
||||
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
|
||||
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
|
||||
|
||||
+13
-16
@@ -105,8 +105,8 @@ static void *bootstrapRoot(void* args) {
|
||||
/* Receive addresses from all ranks */
|
||||
do {
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = NULL;
|
||||
/* bootstrap root thread always uses blocking ncclSocketAccept. */
|
||||
NCCLCHECKGOTO(ncclSocketInit(&sock, NULL, NULL, 0), res, out);
|
||||
NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
|
||||
close(sock.fd);
|
||||
@@ -236,16 +236,17 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
|
||||
info.rank = rank;
|
||||
info.nranks = nranks;
|
||||
struct ncclSocket sock, listenSockRoot;
|
||||
sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag;
|
||||
sock.asyncFlag = listenSockRoot.asyncFlag = 0;
|
||||
|
||||
NCCLCHECK(ncclSocketInit(&sock, (union ncclSocketAddress*) id, comm->abortFlag, 0));
|
||||
NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->abortFlag, 0));
|
||||
NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->abortFlag, 0));
|
||||
NCCLCHECK(ncclSocketInit(&state->ringSendSocket, NULL, comm->abortFlag, 0));
|
||||
NCCLCHECK(ncclSocketInit(&state->ringRecvSocket, NULL, comm->abortFlag, 0));
|
||||
// Create socket for other ranks to contact me
|
||||
memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketListen(&state->listenSock));
|
||||
memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));
|
||||
|
||||
// Create socket for root to contact me
|
||||
memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketListen(&listenSockRoot));
|
||||
memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
|
||||
|
||||
@@ -260,7 +261,6 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
// send info on my listening socket to root
|
||||
memcpy(&sock.addr, id, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketConnect(&sock));
|
||||
NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
|
||||
close(sock.fd);
|
||||
@@ -284,8 +284,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
|
||||
NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
|
||||
struct ncclSocket* proxySocket;
|
||||
NCCLCHECK(ncclCalloc(&proxySocket, 1));
|
||||
proxySocket->abortFlag = NULL; // proxy is aborted through a message
|
||||
memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, NULL, 0));
|
||||
NCCLCHECK(ncclSocketListen(proxySocket));
|
||||
memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
|
||||
@@ -325,9 +324,8 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = state->abortFlag;
|
||||
sock.asyncFlag = 0;
|
||||
memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
|
||||
|
||||
NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, 1));
|
||||
NCCLCHECK(ncclSocketConnect(&sock));
|
||||
NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
|
||||
@@ -416,9 +414,7 @@ ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag,
|
||||
// We can't know who we'll receive from, so we need to receive everything at once
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = state->abortFlag;
|
||||
|
||||
// Search unexpected connections first
|
||||
NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
|
||||
@@ -429,6 +425,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s
|
||||
}
|
||||
|
||||
// Then look for new connections
|
||||
NCCLCHECK(ncclSocketInit(&sock, NULL, state->listenSock.abortFlag, 0));
|
||||
while (1) {
|
||||
NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
|
||||
int newPeer, newTag;
|
||||
@@ -450,9 +447,9 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
WARN("Unexpected connections are not empty");
|
||||
return ncclInternalError;
|
||||
}
|
||||
close(state->listenSock.fd);
|
||||
close(state->ringSendSocket.fd);
|
||||
close(state->ringRecvSocket.fd);
|
||||
if (state->listenSock.fd >= 0) close(state->listenSock.fd);
|
||||
if (state->ringSendSocket.fd >= 0) close(state->ringSendSocket.fd);
|
||||
if (state->ringRecvSocket.fd >= 0) close(state->ringRecvSocket.fd);
|
||||
|
||||
free(state->peerCommAddresses);
|
||||
free(state);
|
||||
|
||||
@@ -40,7 +40,10 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
|
||||
}
|
||||
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
||||
if (channel->id == -1) return ncclSuccess;
|
||||
/* channel peers are only valid when async init thread completes commAlloc() and
|
||||
* the channel is initialized with initChannel(); if either is not done, this channel
|
||||
* should never be freed. */
|
||||
if (channel->id == -1 || channel->peers == NULL) return ncclSuccess;
|
||||
|
||||
// Free transport proxy resources
|
||||
// Note: free all send resources first due to CollNet arrangement
|
||||
|
||||
@@ -29,7 +29,7 @@ all: $(STATICLIB)
|
||||
all_deps: $(DEPENDFILES)
|
||||
|
||||
# Auto-generating the rules per op/reduction/datatype/algorithm
|
||||
$(RULESFILE) :
|
||||
$(RULESFILE) : gen_rules.sh
|
||||
@printf "Generating %-35s > %s\n" rules $@
|
||||
@mkdir -p $(OBJDIR)
|
||||
@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
|
||||
|
||||
@@ -584,22 +584,22 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SI
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
static constexpr int COLLNET_COPY_THREADS = 64;
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclDirect* tree = &ncclShmem.channel.collTree;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
|
||||
const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;
|
||||
|
||||
const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
|
||||
const int hasDn = (tree->down[0] >= 0) ? 1 : 0;
|
||||
const int nThreadsScatter = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
|
||||
const int hasUp = (direct->up[0] >= 0) ? 1 : 0;
|
||||
const int hasDn = (direct->down[0] >= 0) ? 1 : 0;
|
||||
const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
|
||||
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
|
||||
const int nThreadsBcast = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS);
|
||||
const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS);
|
||||
const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
|
||||
const int tidStartBcast = nThreadsGather;
|
||||
const int tidStartScatter = tidStartBcast + nThreadsBcast;
|
||||
@@ -611,24 +611,24 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
// Scatter
|
||||
int group = (2*Proto::MaxGroupWidth) | (1<<16);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
|
||||
int nelem = min(tree->nHeads*chunkSize, size-offset);
|
||||
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
|
||||
int nelem = min(direct->nHeads*chunkSize, size-offset);
|
||||
if (args->regUsed) {
|
||||
prims.directScatter(offset, nelem, chunkSize, tree->headRank, tree->shift);
|
||||
prims.directScatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
|
||||
} else {
|
||||
prims.scatter(offset, nelem, chunkSize, tree->headRank, tree->shift);
|
||||
prims.scatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
|
||||
}
|
||||
}
|
||||
} else if (tid >= tidStartReduce && tree->out != -1) {
|
||||
} else if (tid >= tidStartReduce && direct->out != -1) {
|
||||
int group = (3*Proto::MaxGroupWidth) | (1<<16);
|
||||
if (hasDn) {
|
||||
// Reduce, send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (args->regUsed) {
|
||||
prims.directRecvReduceSend(offset, offset, nelem);
|
||||
@@ -639,9 +639,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
} else {
|
||||
// Directly send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
|
||||
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
@@ -650,29 +650,29 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
// Gather
|
||||
int group = (0*Proto::MaxGroupWidth) | (0<<16);
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
|
||||
int nelem = min(tree->nHeads*chunkSize, size-offset);
|
||||
prims.directGather(offset, nelem, chunkSize, tree->headRank, tree->shift);
|
||||
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
|
||||
int nelem = min(direct->nHeads*chunkSize, size-offset);
|
||||
prims.directGather(offset, nelem, chunkSize, direct->headRank, direct->shift);
|
||||
}
|
||||
} else if (tid >= tidStartBcast && tid < tidStartScatter && tree->out != -1) {
|
||||
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
|
||||
int group = (1*Proto::MaxGroupWidth) | (0<<16);
|
||||
if (hasDn) {
|
||||
// Recv from network, broadcast
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
} else {
|
||||
// Recv from network (no post thread needed)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.recv(offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
@@ -681,6 +681,73 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
}
|
||||
};
|
||||
|
||||
// AllReduce work element for NCCL_ALGO_COLLNET_CHAIN with the Simple protocol.
// The threads of the block are split into two groups that operate on the
// channel's collnetChain tree:
//   - the first group moves data "up": it receives from tree->down[0],
//     reduces, and sends to tree->up;
//   - the second group moves data "down": it receives from tree->up and
//     copies/forwards to tree->down[0].
// Each group builds its own Primitives object over a single recv peer and a
// single send peer (FanSymmetric<1>).
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
|
||||
// Runs the chained AllReduce for one work element described by `args`
// (reads nWarps, bid, nChannels, lastChunkSize, count, sendbuff, recvbuff
// and redOpArg from it).
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclTree *tree = &ncclShmem.channel.collnetChain;
|
||||
ssize_t chunkSize = int(args->lastChunkSize);
|
||||
// One loop iteration covers one chunk per channel.
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const ssize_t size = args->count;
|
||||
|
||||
// Threads [0, nthreadsSplit) form the first (reduce/up) group; the rest form
// the second (copy/down) group. When half the threads is 256 or more, the
// first group is given 64 additional threads.
int nthreadsSplit = nthreads/2;
|
||||
if (nthreadsSplit >= 256) nthreadsSplit += 64;
|
||||
|
||||
int group, send, recv, groupTid, groupNthreads;
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
if (tid < nthreadsSplit) {
|
||||
// NOTE(review): bit 16 is set here but not for the second group; presumably
// the upper bits of `group` select the connector index — confirm against the
// ProtoSimple Primitives constructor.
group = (0*Proto::MaxGroupWidth) | (1<<16);
|
||||
recv = tree->down[0];
|
||||
send = tree->up;
|
||||
groupTid = tid;
|
||||
groupNthreads = nthreadsSplit;
|
||||
} else {
|
||||
group = (1*Proto::MaxGroupWidth);
|
||||
recv = tree->up;
|
||||
send = tree->down[0];
|
||||
groupTid = tid - nthreadsSplit;
|
||||
groupNthreads = nthreads-nthreadsSplit;
|
||||
}
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group);
|
||||
|
||||
// In every loop below, this block (bid) handles the chunk starting at
// gridOffset + bid*chunkSize, and nelem is clamped to the remaining elements.
if (tid < nthreadsSplit) {
|
||||
// First group. recv == -1 presumably means there is no down[0] peer, so
// there is nothing to reduce with and the local data is simply sent.
if (recv == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
} else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Second group. send == -1 presumably means there is no down[0] peer to
// forward to, so the data is only received.
if (send == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
} else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.directRecvCopySend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
|
||||
@@ -35,7 +35,8 @@
|
||||
#define NCCL_FUNC4(func, devredop, type, nullify) \
|
||||
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
|
||||
NCCL_FUNC5(func, RING, devredop, type, nullify), \
|
||||
NCCL_FUNC5(func, COLLNET, devredop, type, nullify)
|
||||
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
|
||||
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
|
||||
@@ -120,7 +121,8 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
|
||||
#define NCCL_FUNC4_LL128(func, devredop, type, nullify) \
|
||||
NCCL_FUNC5_LL128(func, TREE, devredop, type, nullify), \
|
||||
NCCL_FUNC5_LL128(func, RING, devredop, type, nullify), \
|
||||
NCCL_FUNC5_LL128(func, COLLNET, devredop, type, nullify)
|
||||
NCCL_FUNC5_LL128(func, COLLNET_DIRECT, devredop, type, nullify), \
|
||||
NCCL_FUNC5_LL128(func, COLLNET_CHAIN, devredop, type, nullify)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A_LL128(func, devredop, nullForFloat) \
|
||||
@@ -214,8 +216,8 @@ struct Caller<f, f + 1, u>{
|
||||
void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); }
|
||||
};
|
||||
|
||||
static_assert(FUNC_INDEX_P2P == 2710, "Wrong P2P function index");
|
||||
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 2711, "Wrong AllToAllPivot function index");
|
||||
static_assert(FUNC_INDEX_P2P == 3610, "Wrong P2P function index");
|
||||
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 3611, "Wrong AllToAllPivot function index");
|
||||
|
||||
template<bool USING_LL128>
|
||||
__forceinline__
|
||||
@@ -238,49 +240,65 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
|
||||
ncclFunction_AllReduce_TREE_LL128_Sum_float();
|
||||
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL128))
|
||||
ncclFunction_AllReduce_TREE_LL_Sum_float();
|
||||
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE))
|
||||
ncclFunction_AllReduce_COLLNET_SIMPLE_Sum_float();
|
||||
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_LL))
|
||||
ncclFunction_AllReduce_COLLNET_LL_Sum_float();
|
||||
else if (USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_LL128))
|
||||
ncclFunction_AllReduce_COLLNET_LL128_Sum_float();
|
||||
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_LL128))
|
||||
ncclFunction_AllReduce_COLLNET_LL_Sum_float();
|
||||
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE))
|
||||
ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_float();
|
||||
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_LL))
|
||||
ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_float();
|
||||
else if (USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_LL128))
|
||||
ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_float();
|
||||
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_LL128))
|
||||
ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_float();
|
||||
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE))
|
||||
ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_float();
|
||||
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_LL))
|
||||
ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_float();
|
||||
else if (USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_LL128))
|
||||
ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_float();
|
||||
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_LL128))
|
||||
ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_float();
|
||||
else
|
||||
assert("Unsupported function index");
|
||||
#else
|
||||
if (funcIndex < 540) {
|
||||
if (funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t();
|
||||
else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t();
|
||||
if (funcIndex < 720) {
|
||||
if (funcIndex % 12 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
|
||||
else ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
|
||||
}
|
||||
else if (funcIndex < 1080) Caller<540, 1080, USING_LL128>::call(funcIndex);
|
||||
else if (funcIndex < 1620) {
|
||||
if (funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t();
|
||||
else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex < 1440) Caller<720, 1440, USING_LL128>::call(funcIndex);
|
||||
else if (funcIndex < 2160) {
|
||||
if (funcIndex % 12 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
|
||||
else if (funcIndex % 12 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
|
||||
else if (USING_LL128 && funcIndex % 12 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
|
||||
else if (!USING_LL128 && funcIndex % 12 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
|
||||
else ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
|
||||
}
|
||||
else if (funcIndex < 2700) Caller<1620, 2700, USING_LL128>::call(funcIndex);
|
||||
else if (funcIndex < 3600) Caller<2160, 3600, USING_LL128>::call(funcIndex);
|
||||
else {
|
||||
switch (funcIndex - 2700) {
|
||||
switch (funcIndex - 3600) {
|
||||
case 0:
|
||||
ncclFunction_OneRankReduce_PreMulSum_int8_t();
|
||||
break;
|
||||
@@ -707,7 +725,8 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
|
||||
#define IMPL_COLL3(func, devredop, type, ncclType) \
|
||||
IMPL_COLL4(func, TREE, devredop, type, ncclType) \
|
||||
IMPL_COLL4(func, RING, devredop, type, ncclType) \
|
||||
IMPL_COLL4(func, COLLNET, devredop, type, ncclType)
|
||||
IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
|
||||
IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType)
|
||||
|
||||
#define IMPL_COLL2(func, devredop) \
|
||||
IMPL_COLL3(func, devredop, int8_t, ncclInt8) \
|
||||
|
||||
@@ -21,7 +21,8 @@ __shared__ ncclShmemData ncclShmem;
|
||||
#define NCCL_FUNC4(func, devredop, type, nullify) \
|
||||
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
|
||||
NCCL_FUNC5(func, RING, devredop, type, nullify), \
|
||||
NCCL_FUNC5(func, COLLNET, devredop, type, nullify)
|
||||
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
|
||||
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify)
|
||||
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
// Must be consistent with ncclDataType_t
|
||||
|
||||
@@ -21,10 +21,18 @@ for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
|
||||
dtn=0
|
||||
# Order must match that of the ncclDataType_t enum
|
||||
for dt in ${datatypes}; do
|
||||
echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"
|
||||
# Generate a unique filename for each compilation unit,
|
||||
# otherwise the __nv_module_id may conflict at link time
|
||||
echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu"
|
||||
echo " @printf \"Copying %-35s > %s\\\\n\" \$< \$@"
|
||||
echo " cp \$< \$@"
|
||||
echo ""
|
||||
# Compile the file
|
||||
echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep"
|
||||
|
||||
echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
|
||||
echo " mkdir -p ${dir}"
|
||||
echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"
|
||||
echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
|
||||
echo ""
|
||||
targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
|
||||
dtn=$(($dtn + 1))
|
||||
|
||||
@@ -13,7 +13,10 @@ template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
|
||||
class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
|
||||
|
||||
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
|
||||
// In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile
|
||||
// This is because of a recv buffer which is allocated to MaxRecv length in send-only cases
|
||||
static constexpr int MaxRecv = Fan::MaxRecv > 1 ? Fan::MaxRecv : 1;
|
||||
static constexpr int MaxSend = Fan::MaxSend;
|
||||
static constexpr int Input=0, Output=1;
|
||||
RedOp redOp;
|
||||
const int tid;
|
||||
@@ -495,18 +498,19 @@ private:
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF),
|
||||
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
|
||||
int connIndex = group >> 16;
|
||||
auto *channel = &ncclShmem.channel;
|
||||
barriers = &ncclShmem.groups[this->group].barrier;
|
||||
barrier_next = ncclShmem.groups[this->group].barrier_next;
|
||||
|
||||
auto *channel = &ncclShmem.channel;
|
||||
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
|
||||
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
|
||||
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
|
||||
nrecv++;
|
||||
}
|
||||
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
|
||||
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
|
||||
loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
|
||||
nsend++;
|
||||
}
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
|
||||
@@ -236,7 +236,8 @@ private:
|
||||
vr[u+1] = __builtin_nontemporal_load(ptr+u*WARP_SIZE+1);
|
||||
needReload |= flagThread && (vr[u+1] != flag);
|
||||
}
|
||||
} while (__any(needReload) && checkAbort(spins, 0, 0) == 0);
|
||||
needReload &= (0 == checkAbort(spins, 0, 0));
|
||||
} while (__any(needReload));
|
||||
}
|
||||
|
||||
/************* Finish register load **************/
|
||||
@@ -278,7 +279,8 @@ private:
|
||||
vr[u+1] = __builtin_nontemporal_load(ptr+u*WARP_SIZE+1);
|
||||
needReload |= flagThread && (vr[u+1] != flag);
|
||||
}
|
||||
} while (__any(needReload) && checkAbort(spins, i, 0) == 0);
|
||||
needReload &= (0 == checkAbort(spins, i, 0));
|
||||
} while (__any(needReload));
|
||||
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
@@ -409,17 +411,17 @@ public:
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
|
||||
flagThread((tid%4)==3), group(group&(uint16_t)0xFFFF),
|
||||
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
|
||||
int connIndex = group >> 16;
|
||||
auto *channel = &ncclShmem.channel;
|
||||
barriers = &ncclShmem.groups[this->group].barrier;
|
||||
barrier_next = ncclShmem.groups[this->group].barrier_next;
|
||||
|
||||
auto *channel = &ncclShmem.channel;
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
|
||||
nrecv++;
|
||||
}
|
||||
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
|
||||
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
|
||||
loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
|
||||
nsend++;
|
||||
}
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
|
||||
@@ -14,7 +14,8 @@
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
|
||||
template<typename Proto>
|
||||
__device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
|
||||
@@ -75,8 +76,8 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
|
||||
}
|
||||
} else {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
int const chunkSize = args->chunkSize/sizeof(T);
|
||||
int chunkSize = args->chunkSize/sizeof(T);
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
|
||||
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);
|
||||
@@ -112,7 +113,8 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
|
||||
template<typename Proto>
|
||||
__device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
|
||||
#if defined(ENABLE_NPKIT)
|
||||
bool isNpKitThread = (tid == 0);
|
||||
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1;
|
||||
@@ -134,10 +136,10 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#endif
|
||||
|
||||
if (args->peer != ncclShmem.comm.rank) {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
int const chunkSize = args->chunkSize/sizeof(T);
|
||||
int chunkSize = args->chunkSize/sizeof(T);
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
|
||||
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);
|
||||
@@ -191,10 +193,21 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
|
||||
if (args->p2pType == ncclWorkP2pTypeUnused) return;
|
||||
if (tid >= nthreads || args->peer == -1) return;
|
||||
|
||||
// Select Proto here
|
||||
// This is to allow the same kernel to run multiple primitives on different warps (thread groups)
|
||||
if ((group%2) == 0) {
|
||||
runRecv(tid, nthreads, group, args);
|
||||
if (args->proto == NCCL_PROTO_LL) {
|
||||
runRecv<ProtoLL>(tid, nthreads, group, args);
|
||||
} else {
|
||||
runRecv<ProtoSimple<1,1>>(tid, nthreads, group, args);
|
||||
}
|
||||
} else {
|
||||
runSend(tid, nthreads, group, args);
|
||||
if (args->proto == NCCL_PROTO_LL) {
|
||||
runSend<ProtoLL>(tid, nthreads, group, args);
|
||||
} else {
|
||||
runSend<ProtoSimple<1,1>>(tid, nthreads, group, args);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
+84
-94
@@ -22,64 +22,18 @@
|
||||
|
||||
static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t);
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define NCCL_FUNC5(func, algo, devredop, dtype) \
|
||||
/*LL */(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \
|
||||
/*LL128 */nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/, \
|
||||
/*SIMPLE*/nullptr /*(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)*/
|
||||
|
||||
#define NCCL_FUNC4(func, devredop, type) \
|
||||
(void*)NCCL_FUNC5(func, TREE, devredop, type), \
|
||||
(void*)NCCL_FUNC5(func, RING, devredop, type), \
|
||||
(void*)NCCL_FUNC5(func, COLLNET, devredop, type)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(func, devredop) \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, uint8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int32_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, uint32_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int64_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, uint64_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, half), \
|
||||
(void*)NCCL_FUNC4(func, devredop, float), \
|
||||
(void*)NCCL_FUNC4(func, devredop, double), \
|
||||
(void*)NCCL_FUNC4(func, devredop, rccl_bfloat16)
|
||||
#define NCCL_FUNCS3B(func, devredop) \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t), \
|
||||
(void*)NCCL_FUNC4(func, devredop, int8_t)
|
||||
|
||||
// Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums.
|
||||
#define NCCL_FUNCS2A(func) \
|
||||
NCCL_FUNCS3A(func, Sum), /*Sum*/ \
|
||||
NCCL_FUNCS3A(func, Sum), /*Prod*/ \
|
||||
NCCL_FUNCS3A(func, Sum), /*Max*/ \
|
||||
NCCL_FUNCS3A(func, Sum), /*Min*/ \
|
||||
NCCL_FUNCS3A(func, Sum), /*PreMulSum*/ \
|
||||
NCCL_FUNCS3A(func, Sum) /*SumPostDiv*/
|
||||
#define NCCL_FUNCS2B(func) \
|
||||
NCCL_FUNCS3B(func, Sum), /*Sum*/ \
|
||||
NCCL_FUNCS3B(func, Sum), /*Prod*/ \
|
||||
NCCL_FUNCS3B(func, Sum), /*Max*/ \
|
||||
NCCL_FUNCS3B(func, Sum), /*Min*/ \
|
||||
NCCL_FUNCS3B(func, Sum), /*PreMulSum*/ \
|
||||
NCCL_FUNCS3B(func, Sum) /*SumPostDiv*/
|
||||
// Entry of the ncclKerns table: a kernel entry point plus a flag that the
// plan schedulers copy into the kernel plan alongside the pointer.
struct ncclKernelMatch {
|
||||
void* kernelFn; // untyped kernel pointer; handed to hipFuncGetAttributes/hipFuncSetAttribute and stored in plan->kernelFn
|
||||
bool specialized; // copied into plan->kernelSpecialized; schedulers only (re)assign plan->kernelFn while that flag is false
|
||||
};
|
||||
|
||||
typedef void(*ncclKern_t)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
static ncclKern_t const ncclKerns[4] = {
|
||||
NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_KERN_NAME_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_KERN_NAME_LL128(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_KERN_NAME_LL128_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
// Table of the four SendRecv/RING/SIMPLE/Sum/int8_t kernel variants (default,
// DEBUG, LL128, LL128_DEBUG), each marked specialized. The schedulers select
// an entry with ncclGetKernelIndex(comm), so the order here must match the
// indices that function returns.
static ncclKernelMatch const ncclKerns[4] = {
|
||||
{(void *)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true},
|
||||
{(void *)NCCL_KERN_NAME_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t), true},
|
||||
{(void *)NCCL_KERN_NAME_LL128(SendRecv, RING, SIMPLE, Sum, int8_t), true},
|
||||
{(void *)NCCL_KERN_NAME_LL128_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t), true},
|
||||
};
|
||||
|
||||
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
|
||||
@@ -91,8 +45,8 @@ size_t ncclKernMaxLocalSize() {
|
||||
hipFuncAttributes attr = {0};
|
||||
size_t max = 0;
|
||||
for (int i = 0; i < numNcclKerns; i++) {
|
||||
if (ncclKerns[i] != nullptr) {
|
||||
CUDACHECKGOTO(hipFuncGetAttributes(&attr, reinterpret_cast<const void*>(ncclKerns[i])), res, error);
|
||||
if (ncclKerns[i].kernelFn != nullptr) {
|
||||
CUDACHECKGOTO(hipFuncGetAttributes(&attr, reinterpret_cast<const void*>(ncclKerns[i].kernelFn)), res, error);
|
||||
if (attr.localSizeBytes > max) max = attr.localSizeBytes;
|
||||
}
|
||||
}
|
||||
@@ -107,7 +61,7 @@ size_t ncclKernLocalSize(int i) {
|
||||
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
|
||||
hipFuncAttributes attr = {0};
|
||||
if (i < numNcclKerns)
|
||||
CUDACHECKGOTO(hipFuncGetAttributes(&attr, (const void*)(ncclKerns[i])), res, error);
|
||||
CUDACHECKGOTO(hipFuncGetAttributes(&attr, (const void*)(ncclKerns[i].kernelFn)), res, error);
|
||||
|
||||
error:
|
||||
return (res != ncclSuccess) ? 0 : attr.localSizeBytes;
|
||||
@@ -119,7 +73,7 @@ ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) {
|
||||
ncclResult_t res = ncclSuccess;
|
||||
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
|
||||
for (int i = 0; i < numNcclKerns; i++) {
|
||||
CUDACHECKGOTO(hipFuncSetAttribute((const void *)ncclKerns[i], hipFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error);
|
||||
CUDACHECKGOTO(hipFuncSetAttribute((const void *)ncclKerns[i].kernelFn, hipFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error);
|
||||
}
|
||||
|
||||
error:
|
||||
@@ -311,14 +265,14 @@ static ncclResult_t addCollToPlan(
|
||||
workElemReg.elem = *workElem; // C++ struct assignment
|
||||
workElemReg.elem.regUsed = 1;
|
||||
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
|
||||
int peer = channel->collTree.down[i];
|
||||
int peer = channel->collnetDirect.down[i];
|
||||
if (peer == -1) break;
|
||||
int j = comm->rankToLocalRank[peer]; // Get intra-node slot
|
||||
workElemReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer
|
||||
workElemReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer
|
||||
}
|
||||
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
|
||||
int peer = channel->collTree.up[i];
|
||||
int peer = channel->collnetDirect.up[i];
|
||||
if (peer == -1) break;
|
||||
int j = comm->rankToLocalRank[peer];
|
||||
// Output buffer of root peer
|
||||
@@ -340,6 +294,8 @@ static ncclResult_t addCollToPlan(
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384);
|
||||
|
||||
// Put p2p op in plan assuming there is space in nWorkBudget, so you must
|
||||
// ensure *nWorkBudget >= 1 upon entry.
|
||||
static ncclResult_t addP2pToPlan(
|
||||
@@ -357,11 +313,17 @@ static ncclResult_t addP2pToPlan(
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId));
|
||||
info.channelId = channelId;
|
||||
|
||||
// 1 is connIndex
|
||||
struct ncclConnInfo* conn = isSendNotRecv ?
|
||||
&comm->channels[channelId].peers[peer].send[1].conn : &comm->channels[channelId].peers[peer].recv[1].conn;
|
||||
info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE;
|
||||
|
||||
struct ncclProxyOp proxyOp = {};
|
||||
NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp));
|
||||
proxyOp.connIndex = connIndex;
|
||||
|
||||
struct ncclWorkElemP2p elem = {0};
|
||||
elem.proto = info.protocol;
|
||||
elem.peer = peer;
|
||||
elem.nWarps = NCCL_MAX_NTHREADS/comm->WarpSize;
|
||||
elem.p2pType = isSendNotRecv ? ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv;
|
||||
@@ -404,9 +366,7 @@ static void finishPlan(struct ncclKernelPlan* plan) {
|
||||
plan->channelCount = channelCount;
|
||||
plan->channelMask = channelMask;
|
||||
plan->hasProxyOps = hasProxyOps;
|
||||
if (plan->kernelFn == nullptr)
|
||||
plan->kernelFn = ncclKernelGeneric;
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, 3*plan->comm->WarpSize);
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, 4*WARP_SIZE);
|
||||
}
|
||||
|
||||
static ncclResult_t registerIntraNodeBuffers(
|
||||
@@ -565,7 +525,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
|
||||
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
|
||||
if (plan->persistent && ncclParamGraphRegister() &&
|
||||
info.algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now
|
||||
info.algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
|
||||
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
|
||||
comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers
|
||||
NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv));
|
||||
@@ -579,8 +539,10 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
head = ncclIntruQueueHead(&tasks->collQueue);
|
||||
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads);
|
||||
if (ncclKerns[ncclGetKernelIndex(comm)] != nullptr)
|
||||
plan->kernelFn = (void *)ncclKerns[ncclGetKernelIndex(comm)];
|
||||
if (!plan->kernelSpecialized) {
|
||||
plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
|
||||
plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -608,11 +570,15 @@ static ncclResult_t scheduleP2pTasksToPlan(
|
||||
int const *recvOrder = tasks->p2pRecvOrder;
|
||||
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS);
|
||||
if (!plan->kernelSpecialized) {
|
||||
plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
|
||||
plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
|
||||
}
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
if (comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR;
|
||||
if (comm->nNodes > 1) stepSize = comm->p2pNetChunkSize;
|
||||
// Try to use all channels
|
||||
int nChannelsMax = comm->p2pnChannelsPerPeer;
|
||||
int nChannelsMin = nChannelsMax;
|
||||
@@ -714,7 +680,6 @@ static inline uint32_t rollingMin32(uint32_t a, uint32_t b) {
|
||||
// Spin until it's safe to increase comm->workFifoSent to desiredSent.
|
||||
static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) {
|
||||
if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) {
|
||||
uint64_t t0 = clockNano();
|
||||
while (1) {
|
||||
// We have to poll for notifications from device.
|
||||
uint32_t* doneLive = comm->workFifoDone;
|
||||
@@ -747,8 +712,7 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) {
|
||||
|
||||
// See if that was enough.
|
||||
if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break;
|
||||
// Nope. Maintain vigorous spin for first 5us, then start yielding.
|
||||
if (clockNano()-t0 >= 5*1000) sched_yield();
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -874,10 +838,10 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
|
||||
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim`
|
||||
if (plan->persistent) {
|
||||
comm->persistentRefs -= 1;
|
||||
if (!ncclMainExited) NCCLCHECK(ncclCudaFree(plan->workHead));
|
||||
NCCLCHECK(ncclCudaFree(plan->workHead));
|
||||
while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) {
|
||||
struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue);
|
||||
if (!ncclMainExited) CUDACHECKIGNORE(hipIpcCloseMemHandle(q->ptr));
|
||||
CUDACHECKIGNORE(hipIpcCloseMemHandle(q->ptr));
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q);
|
||||
}
|
||||
}
|
||||
@@ -904,7 +868,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
|
||||
// Poll for callbacks sent to us from other threads. Typically these free
|
||||
// resources back to our memory pools.
|
||||
NCCLCHECK(ncclCommPollCallbacks(comm));
|
||||
NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false));
|
||||
|
||||
// We already have one frame present which holds all of our tasks (which we
|
||||
// are about to schedule). Now push an additional frame for allocating
|
||||
@@ -1082,7 +1046,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
|
||||
info->protocol = -1;
|
||||
int nAlgos = NCCL_NUM_ALGORITHMS;
|
||||
for (int a=0; a<nAlgos; a++) {
|
||||
if (a == NCCL_ALGO_COLLNET && collNetTypeSupport != 1) continue;
|
||||
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
float time;
|
||||
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
|
||||
@@ -1104,12 +1068,12 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
|
||||
int nc = (info->nChannels > 0) ? info->nChannels : comm->nChannels;
|
||||
int nt = comm->maxThreads[info->algorithm][info->protocol];
|
||||
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
|
||||
if (info->algorithm == NCCL_ALGO_COLLNET) {
|
||||
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
|
||||
// CollNet channel tuning
|
||||
int ncSwitch = 16;
|
||||
bool flag = true;
|
||||
while (ncSwitch >= 1 && flag) {
|
||||
while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collTree.nHeads*threadThreshold) && nc > ncSwitch) {
|
||||
while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) {
|
||||
if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2;
|
||||
nc--;
|
||||
}
|
||||
@@ -1133,7 +1097,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
|
||||
nt += WARP_SIZE; // Extra warp for sync
|
||||
// More threads or sync warps needed due to split thread model
|
||||
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
|
||||
if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE;
|
||||
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
|
||||
if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
|
||||
}
|
||||
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
|
||||
#endif
|
||||
@@ -1180,7 +1145,11 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
|
||||
case ncclFuncAllToAllPivot:
|
||||
info->pattern = ncclPatternRing; break;
|
||||
case ncclFuncAllReduce:
|
||||
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
|
||||
info->pattern =
|
||||
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
|
||||
info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
|
||||
info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
|
||||
ncclPatternRingTwice; break;
|
||||
default:
|
||||
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
|
||||
return ncclInternalError;
|
||||
@@ -1195,9 +1164,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
||||
case ncclPatternTreeUpDown:
|
||||
case ncclPatternPipelineFrom:
|
||||
case ncclPatternPipelineTo:
|
||||
case ncclPatternCollnetChain:
|
||||
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
|
||||
case ncclPatternCollTreeUpDown:
|
||||
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collTree.nHeads; break;
|
||||
case ncclPatternCollnetDirect:
|
||||
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
|
||||
case ncclPatternRing:
|
||||
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
|
||||
case ncclPatternRingTwice:
|
||||
@@ -1274,15 +1244,22 @@ comp_next:
|
||||
}
|
||||
// Use lastChunkSize as chunkSize
|
||||
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
} else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
|
||||
// Optimize chunkSize / nSteps
|
||||
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*64 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
// Set direct direction for broadcast-gather (read or write)
|
||||
work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
|
||||
} else if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
|
||||
stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
chunkSize = std::min(256*1024, stepSize*chunkSteps);
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*64 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->protocol == NCCL_PROTO_LL) {
|
||||
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
|
||||
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
|
||||
@@ -1311,7 +1288,7 @@ comp_next:
|
||||
proxyOp->chunkSize = chunkSize;
|
||||
proxyOp->protocol = info->protocol;
|
||||
proxyOp->dtype = info->datatype;
|
||||
proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
|
||||
proxyOp->redOp = (info->algorithm != NCCL_ALGO_COLLNET_DIRECT && info->algorithm != NCCL_ALGO_COLLNET_CHAIN) ? ncclNumOps : // Only set redOp when using CollNet
|
||||
info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
|
||||
info->op;
|
||||
proxyOp->pattern = info->pattern;
|
||||
@@ -1514,30 +1491,43 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int devOld = -1;
|
||||
NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, end0);
|
||||
|
||||
NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, fail);
|
||||
// Check whether communicator is ready to communicate
|
||||
NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail);
|
||||
|
||||
if (info->comm->checkPointers) {
|
||||
CUDACHECKGOTO(hipGetDevice(&devOld), ret, end0);
|
||||
CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end0);
|
||||
CUDACHECKGOTO(hipGetDevice(&devOld), ret, fail);
|
||||
CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, fail);
|
||||
}
|
||||
NCCLCHECKGOTO(ArgsCheck(info), ret, end1);
|
||||
NCCLCHECKGOTO(ArgsCheck(info), ret, fail);
|
||||
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast<int64_t>(info->sendbuff), reinterpret_cast<int64_t>(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream);
|
||||
|
||||
NCCLCHECKGOTO(taskAppend(info->comm, info), ret, end1);
|
||||
NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);
|
||||
|
||||
end1:
|
||||
if (devOld != -1) CUDACHECKGOTO(hipSetDevice(devOld), ret, end0);
|
||||
end0:
|
||||
exit:
|
||||
if (devOld != -1) CUDACHECK(hipSetDevice(devOld));
|
||||
ncclGroupErrCheck(ret);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
/* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
|
||||
* so we have to check state here. */
|
||||
if (info->comm && !info->comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
|
||||
return ret;
|
||||
fail:
|
||||
if (info->comm && !info->comm->blocking) (void) ncclCommSetAsyncError(info->comm, ret);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
|
||||
/* join init thread before creating PreMulSum op. */
|
||||
NCCLCHECK(ncclCommEnsureReady(comm));
|
||||
|
||||
if (comm->userRedOpFreeHead == comm->userRedOpCapacity) {
|
||||
// double capacity and resize
|
||||
int cap = 2*comm->userRedOpCapacity;
|
||||
|
||||
+20
-15
@@ -48,13 +48,15 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->ring.prev = channel->ring.next = -1;
|
||||
channel->tree.up = -1;
|
||||
channel->collnetChain.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
|
||||
channel->collTree.out = -1;
|
||||
channel->collTree.headRank = -1;
|
||||
channel->collTree.nHeads = 0;
|
||||
channel->collTree.shift = 0;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.up[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.down[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
|
||||
channel->collnetDirect.out = -1;
|
||||
channel->collnetDirect.headRank = -1;
|
||||
channel->collnetDirect.nHeads = 0;
|
||||
channel->collnetDirect.shift = 0;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
|
||||
|
||||
int* ringIntra = ringGraph->intra+c*localRanks;
|
||||
int* treeIntra = treeGraph->intra+c*localRanks;
|
||||
@@ -76,6 +78,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
topoRanks->treeToChild1[c] = treeIntra[child1Index];
|
||||
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
|
||||
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
|
||||
channel->collnetChain.up = i == 0 ? comm->nRanks : treeIntra[i-1];
|
||||
channel->collnetChain.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
|
||||
}
|
||||
}
|
||||
topoRanks->ringPrev[c] = channel->ring.prev;
|
||||
@@ -571,13 +575,13 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
int nDown = 0;
|
||||
for (int i=0; i<nHeads; i++) {
|
||||
if (rank == heads[i]) { // is head
|
||||
channel->collTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel
|
||||
channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks
|
||||
channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
|
||||
channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
|
||||
int* collNetIntra = collNetGraph->intra+i*localRanks;
|
||||
sprintf(line+strlen(line), "down ");
|
||||
for (int r=0; r<localRanks; r++) {
|
||||
if (collNetIntra[r] == rank) continue;
|
||||
channel->collTree.down[nDown++] = collNetIntra[r]; // connect to all peers
|
||||
channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
|
||||
sprintf(line+strlen(line), " %d ", collNetIntra[r]);
|
||||
}
|
||||
sprintf(line+strlen(line), "nDown %d ", nDown);
|
||||
@@ -589,15 +593,16 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
sprintf(line+strlen(line), "up ");
|
||||
for (int h=0; h<nHeads; h++) {
|
||||
if (rank == heads[h]) continue;
|
||||
channel->collTree.up[nUp++] = heads[h];
|
||||
channel->collnetDirect.up[nUp++] = heads[h];
|
||||
sprintf(line+strlen(line), " %d ", heads[h]);
|
||||
}
|
||||
channel->collTree.nHeads = nHeads;
|
||||
channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
|
||||
channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
|
||||
channel->collnetDirect.nHeads = nHeads;
|
||||
channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
|
||||
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
|
||||
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
|
||||
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
|
||||
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
channel->collnetChain.depth = comm->nRanks/comm->nNodes;
|
||||
}
|
||||
free(heads);
|
||||
return ncclSuccess;
|
||||
@@ -685,7 +690,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
// Setup CollNet
|
||||
if (comm->collNetSupport == 1) {
|
||||
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
|
||||
if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) {
|
||||
if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
|
||||
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
|
||||
}
|
||||
|
||||
+77
-26
@@ -45,7 +45,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
struct ncclTopoLinkList* basePath;
|
||||
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
|
||||
basePath->count = 0;
|
||||
basePath->width = LOC_WIDTH;
|
||||
basePath->bw = LOC_BW;
|
||||
basePath->type = PATH_LOC;
|
||||
|
||||
while (nodeList.count) {
|
||||
@@ -62,8 +62,13 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
}
|
||||
struct ncclTopoLinkList* remPath;
|
||||
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
|
||||
float width = std::min(path->width, link->width);
|
||||
if (remPath->width < width) {
|
||||
float bw = std::min(path->bw, link->bw);
|
||||
|
||||
// allow routing through a GPU only as 1 hop
|
||||
if (node != baseNode && node->type == GPU &&
|
||||
(ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
|
||||
|
||||
if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
|
||||
// Find reverse link
|
||||
for (int l=0; l<remNode->nlinks; l++) {
|
||||
if (remNode->links[l].remNode == node) {
|
||||
@@ -79,7 +84,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
// Copy the rest of the path
|
||||
for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
|
||||
remPath->count = path->count + 1;
|
||||
remPath->width = width;
|
||||
remPath->bw = bw;
|
||||
|
||||
// Start with path type = link type. PATH and LINK types are supposed to match.
|
||||
// Don't consider LINK_NET as we only care about the NIC->GPU path.
|
||||
@@ -128,9 +133,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
|
||||
sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
|
||||
offset = strlen(line);
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width);
|
||||
INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw);
|
||||
#else
|
||||
sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]);
|
||||
sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]);
|
||||
offset = strlen(line);
|
||||
#endif
|
||||
}
|
||||
@@ -184,7 +189,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix,
|
||||
srcNode->paths[t2][i2].count = l;
|
||||
srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
|
||||
if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
|
||||
srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width);
|
||||
srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -419,6 +424,40 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
|
||||
|
||||
// Check whether going through the network would be faster than going through P2P/SHM.
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
|
||||
if (ncclParamNetDisableIntra() == 1) {
|
||||
*net = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
*net = 1;
|
||||
// First check the current GPU-to-GPU speed.
|
||||
int g1, g2;
|
||||
if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess ||
|
||||
ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
|
||||
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
|
||||
float speed = gpu1->paths[GPU][g2].bw;
|
||||
|
||||
// Now check the speed each GPU can access the network through PXB or better
|
||||
float netSpeed1 = 0, netSpeed2 = 0;
|
||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||
struct ncclTopoLinkList* path = gpu1->paths[NET]+n;
|
||||
if (path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw;
|
||||
path = gpu2->paths[NET]+n;
|
||||
if (path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw;
|
||||
}
|
||||
|
||||
if (netSpeed1 > speed && netSpeed2 > speed) return ncclSuccess;
|
||||
*net = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
|
||||
// Get GPU and NET
|
||||
int n, g;
|
||||
@@ -496,17 +535,23 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
// Remove everything in case we're re-computing
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
|
||||
|
||||
// Set direct paths from/to CPUs. We need them in many cases.
|
||||
// Set direct paths to CPUs. We need them in many cases.
|
||||
for (int c=0; c<system->nodes[CPU].count; c++) {
|
||||
NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
|
||||
}
|
||||
|
||||
// Set direct paths from/to GPUs.
|
||||
// Set direct paths to GPUs.
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
// Compute paths to GPU g
|
||||
NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
|
||||
}
|
||||
|
||||
// Update path when we don't want to / can't use GPU Direct P2P
|
||||
// Set direct paths to NICs.
|
||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||
NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system));
|
||||
}
|
||||
|
||||
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
|
||||
@@ -519,31 +564,32 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
}
|
||||
|
||||
if (comm == NULL) continue;
|
||||
// Remove GPUs we can't talk to because of containers.
|
||||
// Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
|
||||
struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank[0];
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
if (p == g) continue;
|
||||
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank[0];
|
||||
int shm;
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
|
||||
if (shm == 0 && p2p == 0) {
|
||||
// Mark this peer as inaccessible. We'll trim it later.
|
||||
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
|
||||
if (p2p == 0) {
|
||||
int shm;
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
|
||||
if (shm == 0) {
|
||||
// Mark this peer as inaccessible. We'll trim it later.
|
||||
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set direct paths from/to NICs.
|
||||
// Update paths for NICs (no GPU Direct, PXN, ...)
|
||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||
struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
|
||||
NCCLCHECK(ncclTopoSetPaths(netNode, system));
|
||||
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) {
|
||||
if (ncclPxnDisable(comm) != 1) {
|
||||
int pxnGpu = -1;
|
||||
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
@@ -551,7 +597,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
|
||||
// PXN = PCI + NVLink.
|
||||
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
|
||||
if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
|
||||
// Only use PXN for NIC n if remote GPU p ...
|
||||
if (peerNode->paths[NET][n].type > PATH_PXB || // Is connected to the NIC through PCI
|
||||
peerNode->paths[GPU][g].type > PATH_NVL || // Is connected to us through NVLink
|
||||
(peerNode->paths[NET][n].bw <= gpu->paths[NET][n].bw && // Has either higher BW to that NIC
|
||||
gpu->paths[NET][n].type <= PATH_PXB)) // or avoids going through a CPU
|
||||
continue;
|
||||
|
||||
pxnGpu = p;
|
||||
|
||||
@@ -630,15 +681,15 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
for (int j = 0; j < system->nodes[NET].count; j ++) {
|
||||
if (i == j) continue;
|
||||
if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
|
||||
if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[j].net.width)
|
||||
system->nodes[NET].nodes[j].net.width = 0;
|
||||
if (system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw)
|
||||
system->nodes[NET].nodes[j].net.bw = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
do {
|
||||
int n;
|
||||
for (n=0; n<system->nodes[NET].count; n++) {
|
||||
if (system->nodes[NET].nodes[n].net.width == 0) break;
|
||||
if (system->nodes[NET].nodes[n].net.bw == 0) break;
|
||||
}
|
||||
if (n<system->nodes[NET].count) {
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
@@ -716,8 +767,8 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
|
||||
// Local rank
|
||||
path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
|
||||
if (path->type == PATH_NVL) {
|
||||
float nvlWidth = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
|
||||
*nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
|
||||
float nvlBw = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
|
||||
*nChannels = 2*std::max(1, (int)(path->bw / nvlBw));
|
||||
} else {
|
||||
*nChannels = 2;
|
||||
}
|
||||
|
||||
@@ -721,7 +721,7 @@ newchannel:
|
||||
} while (str[offset++] != 0);
|
||||
end:
|
||||
graph->nChannels = nChannels;
|
||||
graph->speedIntra = graph->speedInter = system->maxWidth;
|
||||
graph->bwIntra = graph->bwInter = system->maxBw;
|
||||
if (graph->id == 1) {
|
||||
for (int i=0; i<graph->nChannels; i++) {
|
||||
int net;
|
||||
@@ -1031,7 +1031,7 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
|
||||
}
|
||||
if (!link->remNode) continue;
|
||||
if (link->type != LINK_NVL) continue;
|
||||
romeTopo->connMatrix[i*romeTopo->nGpus+n] = link->width/ncclTopoXGMISpeed(node->gpu.gcn);
|
||||
romeTopo->connMatrix[i*romeTopo->nGpus+n] = link->bw/ncclTopoXGMISpeed(node->gpu.gcn);
|
||||
count ++;
|
||||
}
|
||||
if (romeTopo->nLinks < count) romeTopo->nLinks = count;
|
||||
|
||||
+88
-88
@@ -13,39 +13,39 @@
|
||||
#include <sys/time.h>
|
||||
#include "rome_models.h"
|
||||
|
||||
// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
|
||||
// max speed.
|
||||
static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
|
||||
float maxWidth = 0.0;
|
||||
// Initialize system->maxBw. This is the per-channel (i.e. per-SM)
|
||||
// max bw.
|
||||
static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
|
||||
float maxBw = 0.0;
|
||||
for (int i=0; i<system->nodes[type].count; i++) {
|
||||
struct ncclTopoLinkList* path = gpu->paths[type]+i;
|
||||
float width = path->width;
|
||||
float bw = path->bw;
|
||||
if (path->count == 0) continue;
|
||||
maxWidth = std::max(maxWidth, width);
|
||||
maxBw = std::max(maxBw, bw);
|
||||
}
|
||||
return maxWidth;
|
||||
return maxBw;
|
||||
}
|
||||
static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
|
||||
float nvlinkWidth = 0.0, pciWidth = 0.0;
|
||||
static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
|
||||
float nvlinkBw = 0.0, pciBw = 0.0;
|
||||
for (int l=0; l<gpu->nlinks; l++) {
|
||||
struct ncclTopoLink* link = gpu->links+l;
|
||||
if (link->type == LINK_NVL) nvlinkWidth += link->width;
|
||||
if (link->type == LINK_PCI) pciWidth = link->width;
|
||||
if (link->type == LINK_NVL) nvlinkBw += link->bw;
|
||||
if (link->type == LINK_PCI) pciBw = link->bw;
|
||||
}
|
||||
return std::max(pciWidth, nvlinkWidth);
|
||||
return std::max(pciBw, nvlinkBw);
|
||||
}
|
||||
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
|
||||
system->maxWidth = 0.0;
|
||||
system->totalWidth = 0.0;
|
||||
system->maxBw = 0.0;
|
||||
system->totalBw = 0.0;
|
||||
int inter = system->nodes[NET].count;
|
||||
if (inter == 0 && system->nodes[GPU].count == 1) {
|
||||
system->maxWidth = LOC_WIDTH;
|
||||
system->maxBw = LOC_BW;
|
||||
return ncclSuccess;
|
||||
}
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
|
||||
system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu));
|
||||
system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU));
|
||||
system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -65,8 +65,8 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
|
||||
// This is unfortunately needed since manipulating floats often results in rounding errors.
|
||||
#define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000)
|
||||
|
||||
static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float speed, int* steps) {
|
||||
float pciSpeed = speed;
|
||||
static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float bw, int* steps) {
|
||||
float pciBw = bw;
|
||||
for (int step=0; step<path->count; step++) {
|
||||
struct ncclTopoNode* node = path->list[step]->remNode;
|
||||
if (node->type == CPU) {
|
||||
@@ -74,7 +74,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
|
||||
if (path->type == PATH_PHB && start->type == GPU &&
|
||||
node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 &&
|
||||
node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
pciSpeed = INTEL_P2P_OVERHEAD(speed);
|
||||
pciBw = INTEL_P2P_OVERHEAD(bw);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -83,19 +83,19 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
|
||||
for (int step=0; step<maxSteps; step++) {
|
||||
struct ncclTopoLink* link = path->list[step];
|
||||
struct ncclTopoLink* revLink = NULL;
|
||||
float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed;
|
||||
float revSpeed = 0;
|
||||
float fwBw = link->type == LINK_PCI ? pciBw : bw;
|
||||
float revBw = 0;
|
||||
if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
|
||||
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
|
||||
revSpeed += fwSpeed/8;
|
||||
revBw += fwBw/8;
|
||||
}
|
||||
if (link->remNode->type == CPU && link->type == LINK_NVL) {
|
||||
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
|
||||
revSpeed += fwSpeed;
|
||||
revBw += fwBw;
|
||||
}
|
||||
if (link->width < fwSpeed || (revSpeed && revLink->width < revSpeed)) { *steps = step; return ncclSuccess; }
|
||||
SUB_ROUND(link->width, fwSpeed);
|
||||
if (revSpeed) SUB_ROUND(revLink->width, revSpeed);
|
||||
if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
|
||||
SUB_ROUND(link->bw, fwBw);
|
||||
if (revBw) SUB_ROUND(revLink->bw, revBw);
|
||||
node = link->remNode;
|
||||
}
|
||||
*steps = maxSteps;
|
||||
@@ -114,16 +114,16 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
|
||||
// Now check link type
|
||||
*node = NULL;
|
||||
int intra = type1 == GPU && type2 == GPU;
|
||||
float speed = intra ? graph->speedIntra : graph->speedInter;
|
||||
float bw = intra ? graph->bwIntra : graph->bwInter;
|
||||
int type = intra ? graph->typeIntra : graph->typeInter;
|
||||
|
||||
if (mult == 1 && (path->type > type)) return ncclSuccess;
|
||||
|
||||
speed *= mult;
|
||||
bw *= mult;
|
||||
|
||||
// Check there is enough bandwidth on paths.
|
||||
int step = 0;
|
||||
NCCLCHECK(followPath(path, node1, path->count, speed, &step));
|
||||
NCCLCHECK(followPath(path, node1, path->count, bw, &step));
|
||||
if (step < path->count) goto rewind;
|
||||
|
||||
// Enough bandwidth : return destination node.
|
||||
@@ -133,11 +133,11 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
|
||||
|
||||
rewind:
|
||||
// Not enough bandwidth : rewind and exit.
|
||||
NCCLCHECK(followPath(path, node1, step, -speed, &step));
|
||||
NCCLCHECK(followPath(path, node1, step, -bw, &step));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static int gpuPciWidth(struct ncclTopoNode* gpu) {
|
||||
static int gpuPciBw(struct ncclTopoNode* gpu) {
|
||||
for (int l=0; l<gpu->nlinks; l++) {
|
||||
struct ncclTopoLink* gpuLink = gpu->links+l;
|
||||
if (gpuLink->type != LINK_PCI) continue;
|
||||
@@ -145,7 +145,7 @@ static int gpuPciWidth(struct ncclTopoNode* gpu) {
|
||||
for (int l=0; l<pci->nlinks; l++) {
|
||||
struct ncclTopoLink* pciLink = pci->links+l;
|
||||
if (pciLink->remNode != gpu) continue;
|
||||
return std::min(gpuLink->width, pciLink->width);
|
||||
return std::min(gpuLink->bw, pciLink->bw);
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
@@ -157,29 +157,29 @@ struct ncclGpuScore {
|
||||
int g; // Retain the index
|
||||
int startIndex; // Least important
|
||||
int intraNhops;
|
||||
int intraWidth;
|
||||
int intraBw;
|
||||
int interNhops;
|
||||
int interPciWidth;
|
||||
int interWidth; // Most important
|
||||
int interPciBw;
|
||||
int interBw; // Most important
|
||||
};
|
||||
|
||||
static int cmpScore(const void * g1, const void * g2) {
|
||||
struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1;
|
||||
struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2;
|
||||
int d;
|
||||
if ((d = (s2->interWidth - s1->interWidth))) return d;
|
||||
if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
|
||||
if ((d = (s2->interBw - s1->interBw))) return d;
|
||||
if ((d = (s2->interPciBw - s1->interPciBw))) return d;
|
||||
if ((d = (s1->interNhops - s2->interNhops))) return d;
|
||||
if ((d = (s2->intraWidth - s1->intraWidth))) return d;
|
||||
if ((d = (s2->intraBw - s1->intraBw))) return d;
|
||||
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
|
||||
return s1->startIndex - s2->startIndex;
|
||||
}
|
||||
|
||||
static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
|
||||
int intraWidth = scores[0].intraWidth;
|
||||
int intraBw = scores[0].intraBw;
|
||||
int intraNhops = scores[0].intraNhops;
|
||||
for (int i=1; i<count; i++) {
|
||||
if (scores[i].intraWidth != intraWidth || scores[i].intraNhops != intraNhops) return 1;
|
||||
if (scores[i].intraBw != intraBw || scores[i].intraNhops != intraNhops) return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -234,11 +234,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
|
||||
scores[count].g = g;
|
||||
scores[count].startIndex = i;
|
||||
scores[count].intraNhops = paths[g].count;
|
||||
scores[count].intraWidth = paths[g].width;
|
||||
scores[count].intraBw = paths[g].bw;
|
||||
if (netPaths) {
|
||||
scores[count].interNhops = netPaths[g].count;
|
||||
scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
|
||||
scores[count].interWidth = netPaths[g].width;
|
||||
scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes+g);
|
||||
scores[count].interBw = netPaths[g].bw;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
@@ -341,8 +341,8 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
if (graph->nChannels < graph->minChannels) return ncclSuccess;
|
||||
|
||||
// 2. Try to get better bandwidth
|
||||
if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess;
|
||||
if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) {
|
||||
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;
|
||||
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
|
||||
*copy = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -446,23 +446,23 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
|
||||
// Balanced Tree : count half of the bandwidth on first two GPUs
|
||||
int nextBackToNet = -1;
|
||||
float speedInterSave = graph->speedInter;
|
||||
float bwInterSave = graph->bwInter;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
|
||||
// Count half of the bandwidth on each of the first two GPUs
|
||||
if (step == 0) nextBackToNet = 1;
|
||||
else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
|
||||
graph->speedInter /= 2;
|
||||
graph->bwInter /= 2;
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
|
||||
graph->speedInter = speedInterSave;
|
||||
graph->bwInter = bwInterSave;
|
||||
if (net) {
|
||||
graph->inter[graph->nChannels*2+1] = net->id;
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
|
||||
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
|
||||
graph->speedInter = speedInterSave;
|
||||
graph->bwInter = bwInterSave;
|
||||
}
|
||||
}
|
||||
free(nets);
|
||||
@@ -501,7 +501,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
|
||||
const int speed = graph->speedInter;
|
||||
const int bw = graph->bwInter;
|
||||
int* nets;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
int netcount;
|
||||
@@ -511,7 +511,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
struct ncclTopoNode* gpu;
|
||||
if (graph->collNet && net->net.collSupport == 0) continue;
|
||||
if (net->net.width < speed) continue;
|
||||
if (net->net.bw < bw) continue;
|
||||
if (net->net.maxChannels == 0) continue;
|
||||
|
||||
graph->inter[graph->nChannels*2] = net->id;
|
||||
@@ -520,7 +520,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
for (int i=0; i<system->nodes[NET].count; i++) {
|
||||
if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
|
||||
(system->nodes[NET].nodes[i].net.port == net->net.port)) {
|
||||
system->nodes[NET].nodes[i].net.width -= speed;
|
||||
system->nodes[NET].nodes[i].net.bw -= bw;
|
||||
}
|
||||
}
|
||||
net->net.maxChannels--;
|
||||
@@ -554,26 +554,26 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
|
||||
// Then try the most local GPUs
|
||||
float maxWidth = 0;
|
||||
float maxBw = 0;
|
||||
int minHops = 0xfffffff;
|
||||
struct ncclTopoLinkList* paths = net->paths[GPU];
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
if (paths[g].width > maxWidth) {
|
||||
maxWidth = paths[g].width;
|
||||
if (paths[g].bw > maxBw) {
|
||||
maxBw = paths[g].bw;
|
||||
minHops = paths[g].count;
|
||||
} else if (paths[g].width == maxWidth && paths[g].count < minHops) {
|
||||
} else if (paths[g].bw == maxBw && paths[g].count < minHops) {
|
||||
minHops = paths[g].count;
|
||||
}
|
||||
}
|
||||
if (maxWidth >= speed) {
|
||||
if (maxBw >= bw) {
|
||||
// In the first loop, avoid using GPUs in both directions between channels (one channel
|
||||
// sending from that GPU and one channel receiving to that GPU), since that usually leads
|
||||
// to lower BW.
|
||||
for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
if (paths[g].width == maxWidth && paths[g].count == minHops) {
|
||||
if (paths[g].bw == maxBw && paths[g].count == minHops) {
|
||||
gpu = system->nodes[GPU].nodes+g;
|
||||
int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
|
||||
int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
|
||||
if (tryGpuBidir == gpuUsed) {
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
|
||||
}
|
||||
@@ -587,7 +587,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
for (int i=0; i<system->nodes[NET].count; i++) {
|
||||
if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
|
||||
(system->nodes[NET].nodes[i].net.port == net->net.port)) {
|
||||
system->nodes[NET].nodes[i].net.width += speed;
|
||||
system->nodes[NET].nodes[i].net.bw += bw;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -706,8 +706,8 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
|
||||
|
||||
NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
|
||||
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra));
|
||||
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter));
|
||||
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
|
||||
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
|
||||
if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0;
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
|
||||
@@ -767,8 +767,8 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop
|
||||
NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern));
|
||||
NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic));
|
||||
NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
|
||||
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra));
|
||||
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter));
|
||||
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra));
|
||||
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter));
|
||||
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter));
|
||||
const char* str;
|
||||
NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
|
||||
@@ -830,7 +830,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
graph->crossNic = ncclParamCrossNic();
|
||||
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
|
||||
graph->speedIntra = graph->speedInter = 0;
|
||||
graph->bwIntra = graph->bwInter = 0;
|
||||
graph->latencyInter = 0;
|
||||
if (graph->crossNic == 2) graph->crossNic = 0;
|
||||
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
|
||||
@@ -903,7 +903,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
struct ncclTopoGraph tmpGraph;
|
||||
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
|
||||
|
||||
// First try crossnic, then decrease speed and finally increase speedIntra.
|
||||
// First try crossnic, then decrease bw and finally increase bwIntra.
|
||||
int nspeeds = 0;
|
||||
float* speedArray = NULL;
|
||||
if (system->nodes[NET].count == 0) {
|
||||
@@ -915,8 +915,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
}
|
||||
int pass = 1;
|
||||
int speedIndex = 0;
|
||||
while (speedArray[speedIndex] > system->maxWidth && speedIndex < nspeeds-1) speedIndex++;
|
||||
tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
|
||||
while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
|
||||
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
|
||||
int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
|
||||
search:
|
||||
int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS :
|
||||
@@ -926,7 +926,7 @@ search:
|
||||
|
||||
NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
|
||||
#if 0
|
||||
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
|
||||
printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
printf("%2d : ", c);
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
@@ -938,7 +938,7 @@ search:
|
||||
#endif
|
||||
// Optimal solution, stop here
|
||||
if (time == -1) goto done;
|
||||
if (graph->nChannels*graph->speedInter >= system->totalWidth) goto done;
|
||||
if (graph->nChannels*graph->bwInter >= system->totalBw) goto done;
|
||||
|
||||
if (pass == 1) {
|
||||
// First pass, we don't have a solution yet ; try other options
|
||||
@@ -980,14 +980,14 @@ search:
|
||||
}
|
||||
tmpGraph.pattern = graph->pattern;
|
||||
|
||||
// Decrease speed until we find a solution
|
||||
if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
|
||||
tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
|
||||
// Decrease bw until we find a solution
|
||||
if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
|
||||
tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
|
||||
goto search;
|
||||
}
|
||||
speedIndex = 0;
|
||||
while (speedArray[speedIndex] > system->maxWidth && speedIndex < nspeeds-1) speedIndex++;
|
||||
tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
|
||||
while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
|
||||
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
|
||||
|
||||
}
|
||||
|
||||
@@ -997,18 +997,18 @@ done:
|
||||
time = -1;
|
||||
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
|
||||
speedIndex = 0;
|
||||
while (speedArray[speedIndex] > graph->speedInter && speedIndex < nspeeds-1) speedIndex++;
|
||||
tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
|
||||
while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
|
||||
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
|
||||
tmpGraph.minChannels = graph->nChannels;
|
||||
pass = 2;
|
||||
}
|
||||
|
||||
// 3. See if we can increase speedIntra for trees (2 nodes or collnet)
|
||||
// 3. See if we can increase bwIntra for trees (2 nodes or collnet)
|
||||
if (pass == 2) {
|
||||
if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
|
||||
tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 &&
|
||||
tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
|
||||
speedIndex > 0) {
|
||||
tmpGraph.speedIntra = speedArray[--speedIndex];
|
||||
tmpGraph.bwIntra = speedArray[--speedIndex];
|
||||
goto search;
|
||||
}
|
||||
time = -1;
|
||||
@@ -1019,17 +1019,17 @@ done:
|
||||
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
|
||||
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank[0];
|
||||
graph->inter[0] = graph->inter[1] = 0;
|
||||
graph->speedIntra = graph->speedInter = 0.1;
|
||||
graph->bwIntra = graph->bwInter = 0.1;
|
||||
graph->typeIntra = graph->typeInter = PATH_SYS;
|
||||
graph->nChannels = 1;
|
||||
}
|
||||
if (graph->speedIntra >= 25.0) {
|
||||
|
||||
if (graph->bwIntra >= 25.0) {
|
||||
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
|
||||
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
|
||||
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
|
||||
memcpy(graph->intraNets+graph->nChannels*ngpus*2, graph->intraNets, (dupChannels-graph->nChannels)*2*ngpus*sizeof(int));
|
||||
graph->speedIntra /= DIVUP(dupChannels, graph->nChannels);
|
||||
graph->speedInter /= DIVUP(dupChannels, graph->nChannels);
|
||||
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
|
||||
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
|
||||
graph->nChannels = dupChannels;
|
||||
}
|
||||
ncclExpandMultiRank(system, graph);
|
||||
@@ -1037,7 +1037,7 @@ done:
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels);
|
||||
INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->bwIntra, graph->bwInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels);
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
|
||||
char line[1024];
|
||||
|
||||
+37
-32
@@ -62,24 +62,24 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode*
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int interCpuWidth = 0;
|
||||
int cpuPciWidth = 0;
|
||||
int interCpuBw = 0;
|
||||
int cpuPciBw = 0;
|
||||
|
||||
// Writes the bandwidth figure to use for a link between two CPU nodes,
// chosen from the CPU's arch / vendor / model:
//   - default:              LOC_BW
//   - POWER:                P9_BW   (returns immediately)
//   - ARM:                  ARM_BW  (returns immediately)
//   - x86 + Intel vendor:   SKL_QPI_BW for the SKL model, otherwise QPI_BW
//   - x86 + Zhaoxin vendor: YONGFENG_ZPI_BW for the YONGFENG model, otherwise ZPI_BW
// The two x86 branches fall through to the common return; any other x86
// vendor keeps the LOC_BW default. Always returns ncclSuccess.
// NOTE(review): this span reads like a rendered diff hunk -- each statement
// appears twice, once with the *Width/*_WIDTH spelling and once with the
// *Bw/*_BW spelling. Presumably only the *Bw lines are current; confirm
// against the real topo.cc.
static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) {
|
||||
*width = LOC_WIDTH;
|
||||
static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
|
||||
*bw = LOC_BW;
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) {
|
||||
*width = P9_WIDTH;
|
||||
*bw = P9_BW;
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) {
|
||||
*width = ARM_WIDTH;
|
||||
*bw = ARM_BW;
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
|
||||
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
|
||||
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -115,7 +115,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
|
||||
n->nlinks=1;
|
||||
n->links[0].type = LINK_LOC;
|
||||
n->links[0].remNode = n;
|
||||
n->links[0].width = LOC_WIDTH;
|
||||
n->links[0].bw = LOC_BW;
|
||||
n->gpu.dev = NCCL_TOPO_UNDEF;
|
||||
for (int i=0; i<RCCL_TOPO_MAX_RANKS_PER_GPU; i++) {
|
||||
n->gpu.rank[i] = NCCL_TOPO_UNDEF;
|
||||
@@ -129,7 +129,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
|
||||
} else if (type == NET) {
|
||||
n->net.asic = 0ULL;
|
||||
n->net.port = NCCL_TOPO_UNDEF;
|
||||
n->net.width = 0.0;
|
||||
n->net.bw = 0.0;
|
||||
n->net.latency = 0.0;
|
||||
}
|
||||
*node = n;
|
||||
@@ -159,8 +159,8 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int ind
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
|
||||
// Aggregate links into higher width for NVLink
|
||||
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) {
|
||||
// Aggregate links into higher bw for NVLink
|
||||
struct ncclTopoLink* link;
|
||||
for (link = node->links; link->remNode; link++) {
|
||||
if (link->remNode == remNode && link->type == type) break;
|
||||
@@ -168,13 +168,13 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
|
||||
if (link->remNode == NULL) node->nlinks++;
|
||||
link->type = type;
|
||||
link->remNode = remNode;
|
||||
link->width += width;
|
||||
link->bw += bw;
|
||||
|
||||
// Sort links in BW descending order
|
||||
struct ncclTopoLink linkSave;
|
||||
memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
|
||||
while (link != node->links) {
|
||||
if ((link-1)->width >= linkSave.width) break;
|
||||
if ((link-1)->bw >= linkSave.bw) break;
|
||||
memcpy(link, link-1, sizeof(struct ncclTopoLink));
|
||||
link--;
|
||||
}
|
||||
@@ -246,9 +246,9 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
|
||||
for (int n=0; n<system->nodes[CPU].count; n++) {
|
||||
for (int p=0; p<system->nodes[CPU].count; p++) {
|
||||
if (n == p) continue;
|
||||
float width;
|
||||
NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width));
|
||||
NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width));
|
||||
float bw;
|
||||
NCCLCHECK(ncclTopoGetInterCpuBw(system->nodes[CPU].nodes+n, &bw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, bw));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -279,13 +279,13 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
struct ncclTopoLink* link = node->links+l;
|
||||
if (link->type == LINK_LOC) continue;
|
||||
if (link->type != LINK_PCI || link->remNode != prevNode) {
|
||||
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width);
|
||||
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
|
||||
int nextOffset = strlen(line);
|
||||
if (link->type == LINK_PCI) {
|
||||
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
|
||||
} else {
|
||||
if (link->remNode->type == NET) {
|
||||
sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width);
|
||||
sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
|
||||
} else {
|
||||
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
|
||||
}
|
||||
@@ -297,7 +297,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
|
||||
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
|
||||
INFO(NCCL_GRAPH, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw);
|
||||
char line[1024];
|
||||
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
|
||||
INFO(NCCL_GRAPH, "==========================================");
|
||||
@@ -352,7 +352,7 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
|
||||
int mbps;
|
||||
NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
|
||||
if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
|
||||
net->net.width = mbps / 8000.0;
|
||||
net->net.bw = mbps / 8000.0;
|
||||
if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0;
|
||||
NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
|
||||
NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
|
||||
@@ -361,8 +361,8 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
|
||||
net->net.busId = busId;
|
||||
ncclDebugNoWarn = 0;
|
||||
|
||||
NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
|
||||
NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
|
||||
NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -520,8 +520,8 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
|
||||
NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
|
||||
if (nic == NULL) {
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
|
||||
NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
|
||||
NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
|
||||
NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW));
|
||||
NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW));
|
||||
}
|
||||
NCCLCHECK(ncclTopoAddNic(node, system, nic, 0));
|
||||
}
|
||||
@@ -616,10 +616,10 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
}
|
||||
}
|
||||
if (remote) {
|
||||
float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
|
||||
float nvlBw = ncclTopoNVLinkBw(gpu->gpu.cudaCompCap);
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlBw));
|
||||
if (remote->type != GPU) {
|
||||
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
|
||||
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlBw));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -781,18 +781,18 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* i
|
||||
int g;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
|
||||
int minType = PATH_SYS;
|
||||
float maxWidth = 0;
|
||||
float maxBw = 0;
|
||||
int count = 0;
|
||||
int* nets;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||
struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
|
||||
if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) {
|
||||
maxWidth = path->width;
|
||||
if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
|
||||
maxBw = path->bw;
|
||||
minType = path->type;
|
||||
count = 0;
|
||||
}
|
||||
if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
|
||||
if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
|
||||
}
|
||||
if (count == 0) {
|
||||
*id = -1;
|
||||
@@ -890,6 +890,11 @@ ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Stores the number of NVS-type nodes recorded in `system` into `*count`.
// Performs no validation of either pointer and always returns ncclSuccess.
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count) {
|
||||
*count = system->nodes[NVS].count;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
|
||||
if (system->nodes[GPU].count == 0) return ncclInternalError;
|
||||
int min, max;
|
||||
|
||||
+20
-20
@@ -11,25 +11,25 @@
|
||||
#include "graph.h"
|
||||
#include "core.h"
|
||||
|
||||
#define LOC_WIDTH 5000.0
|
||||
#define SM60_NVLINK_WIDTH 18.0
|
||||
#define SM70_NVLINK_WIDTH 22.0
|
||||
#define SM80_NVLINK_WIDTH 22.0
|
||||
#define SM86_NVLINK_WIDTH 12.0
|
||||
#define PCI_WIDTH 12.0 // PCI Gen3 x16
|
||||
#define QPI_WIDTH 6.0
|
||||
#define SKL_QPI_WIDTH 9.0
|
||||
#define ZPI_WIDTH 6.0
|
||||
#define YONGFENG_ZPI_WIDTH 9.0
|
||||
#define P9_WIDTH 32.0
|
||||
#define ARM_WIDTH 6.0
|
||||
#define NET_WIDTH 12.0 // 100Gbit
|
||||
#define LOC_BW 5000.0
|
||||
#define SM60_NVLINK_BW 18.0
|
||||
#define SM70_NVLINK_BW 22.0
|
||||
#define SM80_NVLINK_BW 22.0
|
||||
#define SM86_NVLINK_BW 12.0
|
||||
#define PCI_BW 12.0 // PCI Gen3 x16
|
||||
#define QPI_BW 6.0
|
||||
#define SKL_QPI_BW 9.0
|
||||
#define ZPI_BW 6.0
|
||||
#define YONGFENG_ZPI_BW 9.0
|
||||
#define P9_BW 32.0
|
||||
#define ARM_BW 6.0
|
||||
#define NET_BW 12.0 // 100Gbit
|
||||
#define VEGA_XGMI_WIDTH 24.0
|
||||
#define MI200_XGMI_WIDTH 36.0
|
||||
|
||||
// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU
|
||||
// to GPU traffic consumes more PCI bandwidth.
|
||||
#define INTEL_P2P_OVERHEAD(speed) (speed*6/5)
|
||||
#define INTEL_P2P_OVERHEAD(bw) ((bw)*6/5) // scales bw by 6/5; the argument is parenthesized so a compound expression (e.g. a+b) is scaled as a whole
|
||||
|
||||
#define NCCL_TOPO_NODE_TYPES 7
|
||||
#define GPU 0
|
||||
@@ -81,7 +81,7 @@ extern const char* topoPathTypeStr[];
|
||||
struct ncclTopoNode;
|
||||
// One directed link from a topology node to a remote node.
//   type    - link kind, an int compared against LINK_* constants elsewhere in the visible code
//   bw      - bandwidth figure for the link
//   remNode - node at the far end of the link
// NOTE(review): both `width` and `bw` appear below; the surrounding text
// reads like a width -> bw rename hunk, so presumably only `bw` is meant to
// remain -- confirm against the real header.
struct ncclTopoLink {
|
||||
int type;
|
||||
float width;
|
||||
float bw;
|
||||
struct ncclTopoNode* remNode;
|
||||
};
|
||||
#define NCCL_TOPO_MAX_LINKS 32
|
||||
@@ -90,7 +90,7 @@ struct ncclTopoLink {
|
||||
// A path expressed as an array of link pointers plus summary fields.
//   list  - up to NCCL_TOPO_MAX_HOPS pointers to ncclTopoLink
//   count - presumably the number of valid entries in `list` -- TODO confirm
//   bw    - bandwidth figure for the whole path
//   type  - int path type
// NOTE(review): both `width` and `bw` appear below; this looks like a
// width -> bw rename hunk, so presumably only `bw` is current -- confirm.
struct ncclTopoLinkList {
|
||||
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
|
||||
int count;
|
||||
float width;
|
||||
float bw;
|
||||
int type;
|
||||
};
|
||||
|
||||
@@ -124,7 +124,7 @@ struct ncclTopoNode {
|
||||
struct {
|
||||
uint64_t asic;
|
||||
int port;
|
||||
float width;
|
||||
float bw;
|
||||
float latency;
|
||||
int gdrSupport;
|
||||
int collSupport;
|
||||
@@ -156,8 +156,8 @@ struct ncclTopoNodeSet {
|
||||
|
||||
struct ncclTopoSystem {
|
||||
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
|
||||
float maxWidth;
|
||||
float totalWidth;
|
||||
float maxBw;
|
||||
float totalBw;
|
||||
int type;
|
||||
int nRanks;
|
||||
int netGdrLevel;
|
||||
@@ -172,7 +172,7 @@ struct ncclTopoSystem {
|
||||
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
|
||||
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
|
||||
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
|
||||
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
|
||||
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw);
|
||||
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank);
|
||||
|
||||
+66
-46
@@ -71,18 +71,18 @@ struct tuningModel {
|
||||
static struct tuningModel tuning_model_0 {
|
||||
.hwLat = {
|
||||
/* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 28.3, 28.3, 45.4 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 24.1 }, /* CollNet (LL/LL128/Simple)*/ { 28.3, 28.3, 45.4 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 28.3, 28.3, 45.4 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 24.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 45.4 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 45.4 } },
|
||||
},
|
||||
|
||||
.bwRatio = {
|
||||
/* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.06, 1.00, 1.30 }, /* Ring (LL/LL128/Simple)*/ { 0.07, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.06, 1.00, 1.30 }, /* Ring (LL/LL128/Simple)*/ { 0.07, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.06, 1.00, 0.30 }, /* Ring (LL/LL128/Simple)*/ { 0.07, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.06, 1.00, 0.30 }, /* Ring (LL/LL128/Simple)*/ { 0.07, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
},
|
||||
|
||||
.treeCorrectionFactor = {
|
||||
@@ -101,18 +101,18 @@ static struct tuningModel tuning_model_0 {
|
||||
static struct tuningModel tuning_model_1 {
|
||||
.hwLat =
|
||||
{ /* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNet (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 } },
|
||||
},
|
||||
|
||||
.bwRatio =
|
||||
{ /* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
},
|
||||
|
||||
.treeCorrectionFactor = {
|
||||
@@ -131,18 +131,18 @@ static struct tuningModel tuning_model_1 {
|
||||
static struct tuningModel tuning_model_2 {
|
||||
.hwLat = {
|
||||
/* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNet (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 } },
|
||||
},
|
||||
|
||||
.bwRatio = {
|
||||
/* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
},
|
||||
|
||||
.treeCorrectionFactor = {
|
||||
@@ -161,18 +161,18 @@ static struct tuningModel tuning_model_2 {
|
||||
static struct tuningModel tuning_model_3 {
|
||||
.hwLat = {
|
||||
/* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 17.4, 17.4, 40.3 }, /* Ring (LL/LL128/Simple)*/ { 4.1, 4.1, 40.6 }, /* CollNet (LL/LL128/Simple)*/ { 17.4, 17.4, 40.3 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 17.4, 17.4, 40.3 }, /* Ring (LL/LL128/Simple)*/ { 4.1, 4.1, 40.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 40.3 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 40.3 } },
|
||||
},
|
||||
|
||||
.bwRatio = {
|
||||
/* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.08, 1.00, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.08, 1.00, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.08, 1.00, 0.41 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.08, 1.00, 0.41 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
},
|
||||
|
||||
.treeCorrectionFactor = {
|
||||
@@ -191,18 +191,18 @@ static struct tuningModel tuning_model_3 {
|
||||
static struct tuningModel tuning_model_4 {
|
||||
.hwLat = {
|
||||
/* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNet (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNet (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 } },
|
||||
},
|
||||
|
||||
.bwRatio = {
|
||||
/* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
|
||||
},
|
||||
|
||||
.treeCorrectionFactor = {
|
||||
@@ -232,24 +232,27 @@ static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0,
|
||||
static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8} };
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
||||
int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
|
||||
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), 4*comm->WarpSize, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
||||
#else
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
|
||||
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
|
||||
#endif
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
|
||||
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
||||
#endif
|
||||
|
||||
int nNodes = comm->nNodes;
|
||||
int nRanks = comm->nRanks;
|
||||
@@ -267,7 +270,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
//if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
|
||||
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
|
||||
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
|
||||
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph };
|
||||
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
||||
@@ -284,8 +287,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if ((coll != ncclFuncAllReduce) && a != NCCL_ALGO_RING) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
|
||||
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * speed;
|
||||
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
|
||||
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
|
||||
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
|
||||
|
||||
// Various model refinements
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
@@ -300,8 +304,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels);
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
|
||||
// Collnet+Direct requires all GPUs to have a local NIC to work at full speed
|
||||
float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
|
||||
factor -= (factor-1)/2;
|
||||
busBw /= factor;
|
||||
}
|
||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
|
||||
#endif
|
||||
if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
|
||||
@@ -309,7 +321,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
|
||||
comm->latencies[coll][a][p] = baseLat[a][p];
|
||||
float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
|
||||
float interLat = rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
|
||||
float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
|
||||
//if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
||||
if (a == NCCL_ALGO_RING) {
|
||||
float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
|
||||
@@ -326,9 +338,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
} else if (a == NCCL_ALGO_TREE) {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
|
||||
} else {
|
||||
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency
|
||||
} else if (a == NCCL_ALGO_COLLNET_CHAIN) {
|
||||
comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -337,7 +351,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
// Protocols/Algorithms enable/disable, and user overrides.
|
||||
// All are enabled except ll128 which is enabled by default only in certain cases.
|
||||
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
|
||||
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 };
|
||||
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1 };
|
||||
|
||||
const char *protoStr = getenv("NCCL_PROTO");
|
||||
if (protoStr) {
|
||||
@@ -351,12 +365,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
// Disable CollNet if it is not supported
|
||||
if (comm->collNetSupport == 0) {
|
||||
algoEnable[NCCL_ALGO_COLLNET] = 0;
|
||||
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
||||
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
|
||||
// If user has hard set NCCL_ALGO=COLLNET, ignore it
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
|
||||
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
|
||||
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
|
||||
}
|
||||
} else {
|
||||
// Disable CollNet+Direct if not on an NVSwitch system
|
||||
int nvsCount = 0;
|
||||
NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
|
||||
if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
||||
}
|
||||
|
||||
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
@@ -412,13 +432,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
|
||||
}
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 256;
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 256;
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 256;
|
||||
|
||||
// Override defaults with user env
|
||||
char* str = getenv("NCCL_THREAD_THRESHOLDS");
|
||||
if (str) {
|
||||
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
|
||||
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}};
|
||||
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }};
|
||||
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
@@ -427,16 +448,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld/%ld/%ld",
|
||||
INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
|
||||
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
|
||||
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
|
||||
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE],
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL],
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128],
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE]);
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE],
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+280
-102
@@ -10,31 +10,52 @@
|
||||
#include "enqueue.h"
|
||||
#include "transport.h"
|
||||
#include "channel.h"
|
||||
#include <assert.h>
|
||||
|
||||
__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
|
||||
__thread ncclResult_t ncclGroupError = ncclSuccess;
|
||||
__thread struct ncclComm* ncclGroupCommHead = nullptr;
|
||||
__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr;
|
||||
__thread struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> ncclAsyncJobs;
|
||||
__thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL;
|
||||
__thread struct ncclGroupJob ncclGroupJobMain;
|
||||
__thread int ncclGroupBlocking = -1; /* default mode */
|
||||
__thread bool ncclGroupJobAbortFlag = false;
|
||||
|
||||
void* ncclAsyncJobMain(void* arg);
|
||||
static ncclResult_t groupJobComplete(struct ncclGroupJob *job);
|
||||
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
struct ncclAsyncJob* job,
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*),
|
||||
void(*undo)(struct ncclAsyncJob*),
|
||||
void(*destructor)(void*)
|
||||
void(*destructor)(void*), ncclComm_t comm
|
||||
) {
|
||||
if (0 == ncclGroupDepth) {
|
||||
ncclResult_t res = func(job);
|
||||
if (res != ncclSuccess && undo) undo(job);
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
if (ncclGroupDepth == 0) {
|
||||
ret = func(job);
|
||||
if (ret != ncclSuccess && undo) undo(job);
|
||||
if (destructor) destructor(job);
|
||||
return res;
|
||||
} else {
|
||||
job->func = func;
|
||||
job->undo = undo;
|
||||
job->destructor = destructor;
|
||||
job->abortFlag = comm->abortFlag;
|
||||
job->state = ncclGroupJobRunning;
|
||||
job->comm = comm;
|
||||
/* check if there are blocking and nonblocking comms at the same time in group. */
|
||||
if (ncclGroupBlocking == -1) {
|
||||
/* first met communicator */
|
||||
ncclGroupBlocking = comm->blocking;
|
||||
} else if (ncclGroupBlocking != comm->blocking) {
|
||||
WARN("Blocking and nonblocking communicators are not allowed in the same group.");
|
||||
ret = ncclInvalidArgument;
|
||||
}
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void* ncclAsyncJobMain(void* arg) {
|
||||
@@ -43,23 +64,50 @@ void* ncclAsyncJobMain(void* arg) {
|
||||
if (job->result != ncclSuccess) {
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result);
|
||||
}
|
||||
__atomic_store_n(&job->state, ncclGroupJobDone, __ATOMIC_RELEASE);
|
||||
return arg;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
|
||||
ncclResult_t ret;
|
||||
SYSCHECK(pthread_join(job->thread, NULL), "pthread_join");
|
||||
if (job->result != ncclSuccess) {
|
||||
WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result);
|
||||
}
|
||||
ret = job->result;
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
return ret;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
ncclResult_t ncclGroupStart() {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
/* if previous group launch does not complete, don't launch this one. */
|
||||
if (ncclGroupJobMainPtr != NULL) {
|
||||
if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
|
||||
ret = ncclInvalidUsage;
|
||||
goto exit;
|
||||
} else {
|
||||
NCCLCHECKGOTO(groupJobComplete(ncclGroupJobMainPtr), ret, exit);
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
TRACE_CALL("ncclGroupStart()");
|
||||
return ncclSuccess;
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit);
|
||||
TRACE_CALL("ncclGroupEnd()");
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct ncclPreconnectJob {
|
||||
@@ -145,31 +193,103 @@ failure:
|
||||
return result;
|
||||
}
|
||||
|
||||
ncclResult_t ncclGroupEndInternal() {
|
||||
if (ncclGroupDepth == 0) {
|
||||
WARN("ncclGroupEnd: not in a group call.");
|
||||
return ncclInvalidUsage;
|
||||
static inline void groupResetJobState() {
|
||||
ncclGroupBlocking = -1;
|
||||
ncclGroupJobMainPtr = NULL;
|
||||
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
|
||||
return;
|
||||
}
|
||||
|
||||
static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) {
|
||||
struct ncclComm* comm = *groupCommHeadPtr;
|
||||
|
||||
while (comm != nullptr) {
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
(void) ncclGroupCommLeave(comm); // overwrites comm->groupNext
|
||||
// We don't know if preconnect succeeded or happened at all, so clear
|
||||
// the flags that let `taskAppend()` skip over checking if preconnect
|
||||
// is needed.
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
comm->tasks.peers[i].sendSeen = false;
|
||||
comm->tasks.peers[i].recvSeen = false;
|
||||
comm->connectSend[i] = 0;
|
||||
comm->connectRecv[i] = 0;
|
||||
}
|
||||
comm->unlaunchedPlansHead = nullptr;
|
||||
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
|
||||
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
|
||||
while (!ncclIntruQueueEmpty(&comm->planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
|
||||
// Persistent plans will be reclaimed via the callbackQueue when the
|
||||
// graph drops its UserObject reference.
|
||||
if (!plan->persistent) {
|
||||
for (int c = 0; c < MAXCHANNELS; c++) {
|
||||
while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
}
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
}
|
||||
}
|
||||
// Reset comm->tasks to empty.
|
||||
comm->tasks.nTasksColl = 0;
|
||||
comm->tasks.nTasksP2p = 0;
|
||||
comm->tasks.streams = nullptr;
|
||||
ncclIntruQueueConstruct(&comm->tasks.collQueue);
|
||||
comm->tasks.collBytesTotal = 0;
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
|
||||
}
|
||||
|
||||
if (!comm->blocking)
|
||||
(void) ncclCommSetAsyncError(comm, error);
|
||||
comm = next;
|
||||
}
|
||||
ncclGroupDepth--;
|
||||
if (ncclGroupDepth > 0) return ncclSuccess;
|
||||
|
||||
/* reset everything */
|
||||
while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
|
||||
*job->abortFlag = 1;
|
||||
if (job->comm && !job->comm->blocking)
|
||||
(void) ncclCommSetAsyncError(job->comm, error);
|
||||
if (job->undo) job->undo(job);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
*groupErrorPtr = ncclSuccess;
|
||||
*groupCommHeadPtr = nullptr;
|
||||
*groupCommPreconnectHeadPtr = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
|
||||
int savedDev;
|
||||
CUDACHECK(hipGetDevice(&savedDev));
|
||||
|
||||
ncclResult_t ret = ncclGroupError;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
bool jobsDone = false;
|
||||
if (ret != ncclSuccess) goto failure;
|
||||
bool errorJobAbortFlag = false;
|
||||
struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
|
||||
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
|
||||
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
|
||||
volatile bool *groupAbortFlag = gjob->abortFlagPtr;
|
||||
|
||||
if (ncclGroupCommPreconnectHead != nullptr) {
|
||||
struct ncclComm* comm = ncclGroupCommPreconnectHead;
|
||||
CUDACHECKGOTO(hipGetDevice(&savedDev), ret, fail);
|
||||
|
||||
if (groupCommPreconnectHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommPreconnectHeadMain;
|
||||
do {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECK(ncclCalloc(&job, 1));
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->base.func = ncclPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->comm = comm;
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base);
|
||||
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
|
||||
|
||||
struct ncclComm* next = comm->preconnectNext;
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
@@ -177,96 +297,154 @@ ncclResult_t ncclGroupEndInternal() {
|
||||
} while (comm != nullptr);
|
||||
}
|
||||
|
||||
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
if (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
|
||||
do {
|
||||
pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job);
|
||||
SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail);
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
|
||||
job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
do {
|
||||
int err = pthread_join(job->thread, nullptr);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result;
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
|
||||
jobsDone = true;
|
||||
if (ret != ncclSuccess) goto failure;
|
||||
}
|
||||
|
||||
if (ncclGroupCommHead != nullptr) {
|
||||
NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure);
|
||||
do {
|
||||
struct ncclComm* comm = ncclGroupCommHead;
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
ncclGroupCommLeave(comm);
|
||||
ncclGroupCommHead = next;
|
||||
} while (ncclGroupCommHead != nullptr);
|
||||
}
|
||||
|
||||
if (false) {
|
||||
failure:
|
||||
struct ncclComm* comm = ncclGroupCommHead;
|
||||
while (comm != nullptr) {
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
ncclGroupCommLeave(comm); // overwrites comm->groupNext
|
||||
// We don't know if preconnect succeeded or happened at all, so clear
|
||||
// the flags that let `taskAppend()` skip over checking if preconnect
|
||||
// is needed.
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i=0; i < comm->nRanks; i++) {
|
||||
comm->tasks.peers[i].sendSeen = false;
|
||||
comm->tasks.peers[i].recvSeen = false;
|
||||
comm->connectSend[i] = 0;
|
||||
comm->connectRecv[i] = 0;
|
||||
comm->connectSend[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
|
||||
comm->connectRecv[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
|
||||
}
|
||||
comm->unlaunchedPlansHead = nullptr;
|
||||
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
|
||||
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
|
||||
while (!ncclIntruQueueEmpty(&comm->planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
|
||||
// Persistent plans will be reclaimed via the callbackQueue when the
|
||||
// graph drops its UserObject reference.
|
||||
if (!plan->persistent) {
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
}
|
||||
jobsDone = true;
|
||||
job = ncclIntruQueueHead(asyncJobsMain);
|
||||
do {
|
||||
ncclGroupJobState_t state = __atomic_load_n(&job->state, __ATOMIC_ACQUIRE);
|
||||
if (state == ncclGroupJobRunning) {
|
||||
jobsDone = false;
|
||||
} else if (state == ncclGroupJobDone) {
|
||||
if (pthread_join(job->thread, nullptr) != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
job->state = ncclGroupJobJoined;
|
||||
if (job->result != ncclSuccess) {
|
||||
ret = job->result;
|
||||
errorJobAbortFlag = true;
|
||||
}
|
||||
} else {
|
||||
/* safety check */
|
||||
assert(state == ncclGroupJobJoined);
|
||||
}
|
||||
}
|
||||
// Reset comm->tasks to empty.
|
||||
comm->tasks.nTasksColl = 0;
|
||||
comm->tasks.nTasksP2p = 0;
|
||||
comm->tasks.streams = nullptr;
|
||||
ncclIntruQueueConstruct(&comm->tasks.collQueue);
|
||||
comm->tasks.collBytesTotal = 0;
|
||||
for (int i=0; i < comm->nRanks; i++) {
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
|
||||
}
|
||||
comm = next;
|
||||
}
|
||||
|
||||
if (*groupAbortFlag == true || errorJobAbortFlag == true) {
|
||||
*job->abortFlag = 1;
|
||||
ret = ncclInternalError;
|
||||
}
|
||||
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
} while (jobsDone == false);
|
||||
|
||||
if (ret != ncclSuccess) goto fail;
|
||||
}
|
||||
|
||||
while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs);
|
||||
if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job);
|
||||
if (groupCommHeadMain != nullptr) {
|
||||
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
|
||||
}
|
||||
|
||||
/* this atomic must happen before cleanup and setting state of communicators */
|
||||
__atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE);
|
||||
|
||||
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
|
||||
if (job->comm && !job->comm->blocking)
|
||||
(void) ncclCommSetAsyncError(job->comm, ret);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
ncclGroupError = ncclSuccess;
|
||||
ncclGroupCommHead = nullptr;
|
||||
ncclGroupCommPreconnectHead = nullptr;
|
||||
CUDACHECK(hipSetDevice(savedDev)); // do other clean-ups first before calling hipSetDevice, because this call can fail too
|
||||
while (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain;
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
(void) ncclGroupCommLeave(comm);
|
||||
if (!comm->blocking) {
|
||||
(void) ncclCommSetAsyncError(comm, ret);
|
||||
}
|
||||
groupCommHeadMain = next;
|
||||
}
|
||||
|
||||
*gjob->groupErrorPtr = ncclSuccess;
|
||||
*gjob->groupCommHeadPtr = nullptr;
|
||||
*gjob->groupCommPreconnectHeadPtr = nullptr;
|
||||
|
||||
CUDACHECK(hipSetDevice(savedDev));
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclGroupEndInternal() {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
if (ncclGroupDepth == 0) {
|
||||
WARN("ncclGroupEnd: not in a group call.");
|
||||
ret = ncclInvalidUsage;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if ((--ncclGroupDepth) > 0) goto exit;
|
||||
|
||||
if ((ret = ncclGroupError) != ncclSuccess) goto fail;
|
||||
|
||||
if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) {
|
||||
ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead;
|
||||
ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead;
|
||||
ncclGroupJobMain.groupErrorPtr = &ncclGroupError;
|
||||
ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs;
|
||||
ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag;
|
||||
ncclGroupJobMain.doneFlag = false;
|
||||
ncclGroupJobMainPtr = &ncclGroupJobMain;
|
||||
/* make sure ncclGroupBlocking has been set. */
|
||||
assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
|
||||
if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) {
|
||||
/* nonblocking group */
|
||||
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
do {
|
||||
NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail);
|
||||
job = job->next;
|
||||
} while (job);
|
||||
}
|
||||
|
||||
if (ncclGroupCommHead) {
|
||||
ncclComm_t comm = ncclGroupCommHead;
|
||||
do {
|
||||
NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail);
|
||||
comm = comm->groupNext;
|
||||
} while (comm);
|
||||
}
|
||||
ncclGroupJobMainPtr->base.func = groupLaunch;
|
||||
SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
|
||||
ret = ncclInProgress;
|
||||
} else {
|
||||
/* blocking group */
|
||||
NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail);
|
||||
groupResetJobState();
|
||||
}
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret);
|
||||
groupResetJobState();
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (job) {
|
||||
ret = ncclAsyncJobComplete(&job->base);
|
||||
groupResetJobState();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void ncclGroupJobAbort() {
|
||||
ncclGroupJobAbortFlag = true;
|
||||
(void) groupJobComplete(ncclGroupJobMainPtr);
|
||||
/* reset group abort flag */
|
||||
ncclGroupJobAbortFlag = false;
|
||||
}
|
||||
|
||||
@@ -23,17 +23,15 @@ uint64_t clockNano(); // from utils.h with which we have a circular dependency
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time = 0;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
time = clockNano();
|
||||
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped), result, finish);
|
||||
time = clockNano() - time;
|
||||
memset(*ptr, 0, nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: hipHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
@@ -95,15 +93,14 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
uint64_t time = clockNano();
|
||||
if (isFineGrain)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
else
|
||||
CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time = clockNano() - time;
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)
|
||||
@@ -111,21 +108,17 @@ finish:
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t sideStream = nullptr, bool isFineGrain = false) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time0=0, time1=0, time2=0;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
hipStream_t stream = sideStream;
|
||||
time0 = clockNano();
|
||||
if (stream == nullptr)
|
||||
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
time1 = clockNano();
|
||||
if (isFineGrain)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
else
|
||||
CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time2 = clockNano();
|
||||
CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(hipStreamSynchronize(stream), result, finish);
|
||||
if (sideStream == nullptr)
|
||||
@@ -136,9 +129,10 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipStreamCreateWithFlags=%g hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9);
|
||||
finish:
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
|
||||
@@ -146,16 +140,13 @@ finish:
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time = 0;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
time = clockNano();
|
||||
if (isFineGrain)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
else
|
||||
CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time = clockNano() - time;
|
||||
CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
@@ -163,9 +154,10 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
@@ -107,7 +107,7 @@
|
||||
// Propagate errors up
|
||||
#define NCCLCHECK(call) do { \
|
||||
ncclResult_t res = call; \
|
||||
if (res != ncclSuccess) { \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
@@ -116,7 +116,7 @@
|
||||
|
||||
#define NCCLCHECKGOTO(call, res, label) do { \
|
||||
res = call; \
|
||||
if (res != ncclSuccess) { \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
goto label; \
|
||||
@@ -126,7 +126,7 @@
|
||||
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
|
||||
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
ncclResult_t res = call; \
|
||||
if (res != ncclSuccess) { \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
return ncclInternalError; \
|
||||
} \
|
||||
@@ -136,7 +136,7 @@
|
||||
#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
|
||||
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
res = call; \
|
||||
if (res != ncclSuccess) { \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
goto label; \
|
||||
} \
|
||||
@@ -144,7 +144,7 @@
|
||||
} while (!(cond));
|
||||
|
||||
#define NCCLCHECKTHREAD(a, args) do { \
|
||||
if (((args)->ret = (a)) != ncclSuccess) { \
|
||||
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
|
||||
return args; \
|
||||
} \
|
||||
|
||||
@@ -52,8 +52,9 @@ struct ncclDevRedOpFull {
|
||||
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
|
||||
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
|
||||
|
||||
#define SINGLE_ARG(...) __VA_ARGS__
|
||||
#define CONCAT(a,b) a##b
|
||||
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
|
||||
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f))
|
||||
#define MACRO_IF_0(t, f) f
|
||||
#define MACRO_IF_1(t, f) t
|
||||
|
||||
@@ -65,7 +66,8 @@ struct ncclDevRedOpFull {
|
||||
#define DECL3(func, devredop, type, undef) \
|
||||
DECL4(func, RING, devredop, type, undef) \
|
||||
DECL4(func, TREE, devredop, type, undef) \
|
||||
DECL4(func, COLLNET, devredop, type, undef)
|
||||
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
|
||||
DECL4(func, COLLNET_CHAIN, devredop, type, undef)
|
||||
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
#define DECL2(func, devredop, undefForFloat) \
|
||||
@@ -132,7 +134,6 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
|
||||
#define BROADCAST_CHUNKSTEPS 1
|
||||
#define REDUCE_SLICESTEPS 1
|
||||
#define REDUCE_CHUNKSTEPS 1
|
||||
#define SENDRECV_SLICEFACTOR 4
|
||||
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
|
||||
#define ALLTOALL_PIVOT_SLICESTEPS 2
|
||||
#define ALLTOALL_PIVOT_CHUNKSTEPS 4
|
||||
|
||||
+28
-11
@@ -107,8 +107,9 @@ struct ncclChannel {
|
||||
struct ncclRing ring;
|
||||
int* devRingUserRanks;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree collnetChain;
|
||||
struct ncclDirect collnetDirect;
|
||||
struct ncclTree binTree;
|
||||
struct ncclDirect collTree;
|
||||
int id; // index of this channel
|
||||
uint32_t workFifoSent; // last used work index+1
|
||||
uint64_t p2pOpCount;
|
||||
@@ -134,6 +135,7 @@ struct ncclKernelPlan {
|
||||
struct ncclKernelPlan* next;
|
||||
|
||||
bool persistent; // aka captured in a graph
|
||||
bool kernelSpecialized;
|
||||
void *kernelFn;
|
||||
int channelUbound; // only channels c < channelUbound are present
|
||||
int channelCount; // number of channels present
|
||||
@@ -209,8 +211,12 @@ struct ncclComm {
|
||||
int p2pnChannelsPerPeer;
|
||||
int p2pChannels[MAXCHANNELS];
|
||||
|
||||
// Should this comm allocate LL buffers for network P2P connections?
|
||||
bool allocP2pNetLLBuffers;
|
||||
|
||||
// Buffer sizes
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
int p2pNetChunkSize;
|
||||
|
||||
// Algorithm/Protocols thresholds
|
||||
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
@@ -218,8 +224,9 @@ struct ncclComm {
|
||||
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// Whether there has been a fatal error in this communicator.
|
||||
ncclResult_t fatalError;
|
||||
/* This attribute can indicate the states of communicators and return code of
|
||||
* asynchronous NCCL operations. */
|
||||
ncclResult_t asyncResult;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
@@ -301,12 +308,16 @@ struct ncclComm {
|
||||
pthread_t collTraceThread;
|
||||
volatile bool collTraceExit;
|
||||
#endif
|
||||
};
|
||||
|
||||
// Set to true during an `atexit()` handler. We use this to intentionally leak
|
||||
// unfreed CUDA resources when cleaning up after return of `main()` to avoid
|
||||
// CUDA calls after CUDA runtime teardown.
|
||||
extern bool ncclMainExited;
|
||||
// communicator mode
|
||||
int blocking;
|
||||
// initState is to more conveniently reclaim resources when errors happen.
|
||||
ncclResult_t initState;
|
||||
// flag to indicate if ncclCommFinalize() is called
|
||||
bool finalizeCalled;
|
||||
// shared structures for finalization
|
||||
int finalizeRankCnt;
|
||||
};
|
||||
|
||||
enum ncclLaunchMode {
|
||||
ncclLaunchModeInvalid=0,
|
||||
@@ -320,13 +331,16 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* buf);
|
||||
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf);
|
||||
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle);
|
||||
|
||||
inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) {
|
||||
struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false);
|
||||
inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome);
|
||||
while (cb != nullptr) {
|
||||
struct ncclCommCallback* next = cb->next;
|
||||
NCCLCHECK(cb->fn(comm, cb)); // may reclaim memory of cb
|
||||
ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb
|
||||
if (res1 != ncclSuccess) result = res1;
|
||||
cb = next;
|
||||
}
|
||||
NCCLCHECK(result);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -383,4 +397,7 @@ static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) {
|
||||
return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1);
|
||||
}
|
||||
|
||||
ncclResult_t ncclCommEnsureReady(ncclComm_t comm);
|
||||
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -12,9 +12,9 @@
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cudaTypedefs.h>
|
||||
#else
|
||||
typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion);
|
||||
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
|
||||
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
|
||||
#endif
|
||||
|
||||
#define CUPFN(symbol) pfn_##symbol
|
||||
@@ -60,27 +60,27 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn,
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
|
||||
#if CUDA_VERSION >= 11070
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuInit);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuInit, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
|
||||
|
||||
|
||||
ncclResult_t cudaLibraryInit(void);
|
||||
|
||||
@@ -21,10 +21,11 @@
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
|
||||
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
|
||||
#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
#define NCCL_ALGO_COLLNET 2
|
||||
#define NCCL_ALGO_COLLNET_DIRECT 2
|
||||
#define NCCL_ALGO_COLLNET_CHAIN 3
|
||||
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
@@ -234,8 +235,9 @@ static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWork
|
||||
|
||||
struct ncclWorkElemP2p {
|
||||
struct {
|
||||
int32_t peer:30;
|
||||
int32_t peer:28;
|
||||
uint32_t connIndex:2;
|
||||
int32_t proto:2;
|
||||
};
|
||||
union {
|
||||
uint16_t flagBits;
|
||||
@@ -356,8 +358,9 @@ struct alignas(16) ncclDevChannel {
|
||||
struct ncclDevChannelPeer *peers;
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree collnetChain;
|
||||
struct ncclDirect collnetDirect;
|
||||
struct ncclTree binTree;
|
||||
struct ncclDirect collTree;
|
||||
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
|
||||
};
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int ne
|
||||
#define MAX_XGMI_INTER_GPUS 4
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
|
||||
int ncclPxnDisable(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
||||
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
|
||||
@@ -57,6 +58,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
@@ -78,8 +80,8 @@ struct ncclTopoGraph {
|
||||
int maxChannels;
|
||||
// Output
|
||||
int nChannels;
|
||||
float speedIntra;
|
||||
float speedInter;
|
||||
float bwIntra;
|
||||
float bwInter;
|
||||
float latencyInter;
|
||||
int typeIntra;
|
||||
int typeInter;
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -14,7 +13,18 @@
|
||||
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
|
||||
void ncclGroupCommJoin(struct ncclComm* comm);
|
||||
void ncclGroupCommPreconnect(struct ncclComm* comm);
|
||||
void ncclGroupCommLeave(struct ncclComm* comm);
|
||||
ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
|
||||
void ncclGroupJobAbort();
|
||||
|
||||
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
|
||||
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
|
||||
|
||||
/* State values for an asynchronous job; this is the type of the `state`
 * member of struct ncclAsyncJob.
 * NOTE(review): the per-value notes below are inferred from the enumerator
 * names only -- confirm against the code that updates `state`. */
typedef enum ncclGroupJobState {
|
||||
ncclGroupJobRunning = 0, // presumably: job still executing
|
||||
ncclGroupJobDone = 1, // presumably: job function has finished
|
||||
ncclGroupJobJoined = 2, // presumably: job's thread has been joined
|
||||
} ncclGroupJobState_t;
|
||||
|
||||
struct ncclAsyncJob {
|
||||
struct ncclAsyncJob* next;
|
||||
@@ -23,17 +33,31 @@ struct ncclAsyncJob {
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*);
|
||||
void(*undo)(struct ncclAsyncJob*);
|
||||
void(*destructor)(void*);
|
||||
ncclGroupJobState_t state;
|
||||
volatile uint32_t *abortFlag; /* point to comm abortFlag */
|
||||
ncclComm_t comm;
|
||||
};
|
||||
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
struct ncclAsyncJob* job,
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*),
|
||||
void(*undo)(struct ncclAsyncJob*),
|
||||
void(*destructor)(void*)
|
||||
void(*destructor)(void*), ncclComm_t comm
|
||||
);
|
||||
|
||||
/* A group-level job: an embedded ncclAsyncJob followed by pointers to group
 * state and a completion flag.
 * NOTE(review): the pointer members presumably refer to the launching
 * thread's group variables (ncclGroupCommHead, ncclGroupCommPreconnectHead,
 * ncclGroupError, ...) -- confirm against the code that fills this struct. */
struct ncclGroupJob {
|
||||
struct ncclAsyncJob base; // embedded async-job record; first member
|
||||
struct ncclComm **groupCommHeadPtr; // presumably &ncclGroupCommHead -- TODO confirm
|
||||
struct ncclComm **groupCommPreconnectHeadPtr; // presumably &ncclGroupCommPreconnectHead -- TODO confirm
|
||||
ncclResult_t *groupErrorPtr; // presumably &ncclGroupError -- TODO confirm
|
||||
volatile bool *abortFlagPtr; // abort indicator read through a volatile pointer; owner not visible here
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr; // intrusive queue of jobs linked through ncclAsyncJob::next
|
||||
bool doneFlag; // NOTE(review): looks like a completion marker for the group job -- confirm who sets it
|
||||
};
|
||||
|
||||
ncclResult_t ncclGroupStartInternal();
|
||||
ncclResult_t ncclGroupEndInternal();
|
||||
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -41,6 +65,7 @@ extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
|
||||
extern __thread ncclResult_t ncclGroupError;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead;
|
||||
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
|
||||
extern __thread int ncclGroupBlocking;
|
||||
|
||||
inline ncclResult_t ncclGroupStartInternal() {
|
||||
ncclGroupDepth++;
|
||||
@@ -49,7 +74,7 @@ inline ncclResult_t ncclGroupStartInternal() {
|
||||
|
||||
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
|
||||
if (ncclGroupDepth > 0) {
|
||||
if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
|
||||
if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@@ -69,6 +94,8 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
// this comm is allocated there.
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
}
|
||||
|
||||
ncclGroupBlocking = comm->blocking;
|
||||
}
|
||||
|
||||
// Add comm to this thread's group needing preconnect
|
||||
@@ -80,9 +107,10 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
// Comm has left group
|
||||
inline void ncclGroupCommLeave(struct ncclComm* comm) {
|
||||
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
ncclMemoryStackPop(&comm->memScoped);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -23,7 +23,8 @@ typedef enum : uint8_t {
|
||||
ncclPatternTreeUp,
|
||||
ncclPatternTreeDown,
|
||||
ncclPatternTreeUpDown,
|
||||
ncclPatternCollTreeUpDown,
|
||||
ncclPatternCollnetChain,
|
||||
ncclPatternCollnetDirect,
|
||||
ncclPatternSend,
|
||||
ncclPatternRecv
|
||||
} ncclPattern_t;
|
||||
|
||||
@@ -173,6 +173,7 @@ struct ncclProxyState {
|
||||
struct ncclSocket* listenSock;
|
||||
int stop;
|
||||
hipCtx_t cudaCtx;
|
||||
int safeAbortFlag;
|
||||
|
||||
// Used by main thread
|
||||
union ncclSocketAddress* peerAddresses;
|
||||
@@ -192,6 +193,7 @@ struct ncclProxyConnection {
|
||||
struct ncclProxyArgs *proxyAppend;
|
||||
struct ncclProxyArgs **proxyAppendPtr;
|
||||
void* transportResources;
|
||||
bool initFlag;
|
||||
};
|
||||
|
||||
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
+451
-118
@@ -49,7 +49,7 @@
|
||||
#endif
|
||||
|
||||
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAllPivot" };
|
||||
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
|
||||
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" };
|
||||
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
|
||||
const char* ncclDevRedOpStr[ncclNumDevRedOps] = { "Sum", "Prod", "Max", "Min", "PreMulSum", "SumPostDiv" };
|
||||
const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "_u64", "_f16", "_f32", "_f64", "_b16"};
|
||||
@@ -57,6 +57,8 @@ const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "
|
||||
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
||||
|
||||
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", 0);
|
||||
|
||||
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
|
||||
|
||||
static uint64_t hashUniqueId(ncclUniqueId const &id) {
|
||||
@@ -90,17 +92,10 @@ pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static bool initialized = false;
|
||||
static size_t maxLocalSizeBytes = 0;
|
||||
|
||||
bool ncclMainExited = false;
|
||||
|
||||
static void atexitHandler() {
|
||||
ncclMainExited = true;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclInit() {
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess;
|
||||
pthread_mutex_lock(&initLock);
|
||||
if (!initialized) {
|
||||
atexit(atexitHandler);
|
||||
initEnv();
|
||||
initGdrCopy();
|
||||
maxLocalSizeBytes = ncclKernMaxLocalSize();
|
||||
@@ -299,46 +294,11 @@ void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) {
|
||||
comm->destructorHead = dtor;
|
||||
}
|
||||
|
||||
void commZombieCleanup(struct ncclComm* comm) {
|
||||
ncclMemoryStackDestruct(&comm->memScoped);
|
||||
ncclMemoryStackDestruct(&comm->memPermanent);
|
||||
|
||||
struct ncclComm* intraComm0 = comm->intraComm0;
|
||||
if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) {
|
||||
// Wait for all service threads to be done. We could not
|
||||
// do it earlier because it could have blocked and prevented
|
||||
// other ranks in the process to call ncclCommDestroy
|
||||
comm = intraComm0;
|
||||
while (comm != nullptr) {
|
||||
if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr);
|
||||
struct ncclComm* next = comm->intraNext;
|
||||
free(comm);
|
||||
comm = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void* commZombieMain(void* arg) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclComm* comm = (struct ncclComm*)arg;
|
||||
while (comm->persistentRefs != 0) {
|
||||
struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/true);
|
||||
while (cb != nullptr) {
|
||||
struct ncclCommCallback* next = cb->next;
|
||||
NCCLCHECKGOTO(cb->fn(comm, cb), result, ignore); // may reclaim memory of cb
|
||||
ignore:
|
||||
cb = next;
|
||||
}
|
||||
}
|
||||
commZombieCleanup(comm);
|
||||
return arg;
|
||||
}
|
||||
|
||||
static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
|
||||
// First stop all threads before we free anything.
|
||||
// Stop all threads before we free anything.
|
||||
NCCLCHECK(ncclProxyDestroy(comm));
|
||||
|
||||
delete[] comm->userRedOps;
|
||||
@@ -371,9 +331,12 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
#endif
|
||||
|
||||
free(comm->peerInfo);
|
||||
ncclTopoFree(comm->topo);
|
||||
for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
|
||||
free(comm->nodeRanks);
|
||||
if (comm->topo)
|
||||
ncclTopoFree(comm->topo);
|
||||
if (comm->nodeRanks) {
|
||||
for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
|
||||
free(comm->nodeRanks);
|
||||
}
|
||||
free(comm->rankToNode);
|
||||
free(comm->rankToLocalRank);
|
||||
|
||||
@@ -386,10 +349,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm->doneEvent != NULL)
|
||||
CUDACHECK(hipEventDestroy(comm->doneEvent));
|
||||
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
|
||||
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
|
||||
if (comm->initState == ncclSuccess) {
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
|
||||
}
|
||||
|
||||
struct ncclDestructor* dtor = comm->destructorHead;
|
||||
while (dtor != nullptr) {
|
||||
@@ -398,16 +361,34 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
}
|
||||
CUDACHECK(hipStreamDestroy(comm->sideStream));
|
||||
|
||||
ncclMemoryStackDestruct(&comm->memScoped);
|
||||
ncclMemoryStackDestruct(&comm->memPermanent);
|
||||
|
||||
commPoison(comm); // Important that this does not interfere with anything used below.
|
||||
|
||||
if (comm->persistentRefs == 0) {
|
||||
commZombieCleanup(comm);
|
||||
if (comm->initState == ncclSuccess) {
|
||||
struct ncclComm* intraComm0 = comm->intraComm0;
|
||||
if (0 == ncclAtomicRefCountDecrement(&intraComm0->intraRefs)) {
|
||||
// Wait for all service threads to be done. We could not
|
||||
// do it earlier because it could have blocked and prevented
|
||||
// other ranks in the process to call ncclCommDestroy
|
||||
comm = intraComm0;
|
||||
while (comm != nullptr) {
|
||||
if (comm->proxyState.thread) pthread_join(comm->proxyState.thread, nullptr);
|
||||
struct ncclComm* next = comm->intraNext;
|
||||
free(comm);
|
||||
comm = next;
|
||||
}
|
||||
}
|
||||
} else if (comm->proxyState.thread) {
|
||||
pthread_join(comm->proxyState.thread, nullptr);
|
||||
ncclCudaHostFree((void *)comm->abortFlag);
|
||||
free(comm);
|
||||
} else {
|
||||
// Spawn a thread to listen for remaining messages from graph cleanup.
|
||||
pthread_t zombie;
|
||||
pthread_create(&zombie, nullptr, commZombieMain, comm);
|
||||
pthread_detach(zombie);
|
||||
ncclCudaHostFree((void *)comm->abortFlag);
|
||||
free(comm);
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -445,6 +426,26 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
/**
 * Verify that a communicator may accept a new operation.
 *
 * - If the communicator's abort flag is set, ncclGroupJobAbort() is called
 *   and ncclSuccess is returned.
 * - Otherwise the async state is read with ncclCommGetAsyncError(). Any state
 *   other than ncclSuccess is reported with a warning and returned to the
 *   caller, except that ncclInProgress is translated to ncclInvalidArgument
 *   (using a communicator whose previous operation has not completed is
 *   treated as a caller error).
 *
 * @param comm  communicator to check; comm->abortFlag is dereferenced.
 * @return ncclSuccess when the communicator is usable, an error code otherwise.
 */
ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
  ncclResult_t ret = ncclSuccess;

  if (*comm->abortFlag) {
    ncclGroupJobAbort();
    return ret;
  }

  NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
  if (ret != ncclSuccess) {
    /* Any state other than ncclInProgress is passed through unchanged. */
    /* No trailing newline: the other WARN calls in this code base pass none. */
    WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
    if (ret == ncclInProgress) ret = ncclInvalidArgument;
  }
  return ret;
}
|
||||
|
||||
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtualId) {
|
||||
if (ndev < 1) {
|
||||
WARN("invalid device count (%d) requested", ndev);
|
||||
@@ -456,7 +457,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
|
||||
}
|
||||
|
||||
struct ncclComm* comm;
|
||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||
/* Currently we calloc comm in ncclCommInitRankDev for async function support.
|
||||
* This 'if' structure is designed to consider the case where commAlloc is called
|
||||
* in other cases except ncclCommInitRankDev. */
|
||||
if (*comret == NULL) {
|
||||
/* user requests a new communicator */
|
||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||
NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
|
||||
NCCLCHECK(ncclCommSetAsyncError(comm, ncclInProgress));
|
||||
} else {
|
||||
/* We already allocated a communicator in ncclCommInitRankDev. */
|
||||
comm = *comret;
|
||||
}
|
||||
|
||||
ncclMemoryStackConstruct(&comm->memPermanent);
|
||||
ncclMemoryStackConstruct(&comm->memScoped);
|
||||
comm->destructorHead = nullptr;
|
||||
@@ -485,10 +498,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
|
||||
CUDACHECK(hipStreamCreateWithFlags(&comm->sideStream, hipStreamNonBlocking));
|
||||
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
|
||||
comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
|
||||
comm->fatalError = ncclSuccess;
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
|
||||
*comm->abortFlag = 0;
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
NCCLCHECK(ncclCudaHostCalloc((uint32_t **)&comm->collTraceTail, 1));
|
||||
@@ -572,8 +581,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
|
||||
tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks;
|
||||
tmpCommAndChans.channels[c].tree = comm->channels[c].tree;
|
||||
tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
|
||||
tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
|
||||
tmpCommAndChans.channels[c].binTree = comm->channels[c].binTree;
|
||||
tmpCommAndChans.channels[c].collTree = comm->channels[c].collTree;
|
||||
tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
|
||||
|
||||
if (comm->channels[c].ring.userRanks != nullptr) {
|
||||
@@ -682,6 +692,8 @@ NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
|
||||
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
|
||||
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
|
||||
|
||||
NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */
|
||||
|
||||
static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
int cpuArch, cpuVendor, cpuModel;
|
||||
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
|
||||
@@ -694,12 +706,15 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
}
|
||||
|
||||
comm->p2pNetChunkSize = ncclParamP2pNetChunkSize();
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
|
||||
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
|
||||
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0);
|
||||
// Every other NCCL_PARAM in this file passes the variable name without the
// "NCCL_" prefix; do the same here so the environment variable is
// NCCL_ALLOC_P2P_NET_LL_BUFFERS rather than a double-prefixed name.
NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
|
||||
// We use 2 AllGathers
|
||||
@@ -825,6 +840,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
allXgmi &= isXGMI;
|
||||
}
|
||||
}
|
||||
// Initialize num P2P LL buffers for this communicator
|
||||
comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
|
||||
|
||||
if (comm->rank == ncclParamGraphDumpFileRank()) {
|
||||
struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
|
||||
@@ -856,8 +873,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int pattern;
|
||||
int nChannels;
|
||||
int sameChannels;
|
||||
float speedIntra;
|
||||
float speedInter;
|
||||
float bwIntra;
|
||||
float bwInter;
|
||||
int typeIntra;
|
||||
int typeInter;
|
||||
};
|
||||
@@ -900,22 +917,22 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
allGather3Data[rank].tree.pattern = treeGraph.pattern;
|
||||
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
|
||||
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
|
||||
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
|
||||
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
|
||||
allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra;
|
||||
allGather3Data[rank].tree.bwInter = treeGraph.bwInter;
|
||||
allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
|
||||
allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
|
||||
allGather3Data[rank].ring.pattern = ringGraph.pattern;
|
||||
allGather3Data[rank].ring.nChannels = ringGraph.nChannels;
|
||||
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
|
||||
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
|
||||
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
|
||||
allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra;
|
||||
allGather3Data[rank].ring.bwInter = ringGraph.bwInter;
|
||||
allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
|
||||
allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
|
||||
allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
|
||||
allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels;
|
||||
allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
|
||||
allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
|
||||
allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
|
||||
allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra;
|
||||
allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter;
|
||||
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
|
||||
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
|
||||
allGather3Data[rank].collNetSupport = comm->collNetSupport;
|
||||
@@ -990,20 +1007,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
// Make sure we align all ranks so that the tuning is consistent across ranks
|
||||
treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
|
||||
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
|
||||
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
|
||||
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
|
||||
treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra);
|
||||
treeGraph.bwInter = std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter);
|
||||
treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
|
||||
treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
|
||||
ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
|
||||
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
|
||||
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
|
||||
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
|
||||
ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra);
|
||||
ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter);
|
||||
ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
|
||||
ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
|
||||
collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
|
||||
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
|
||||
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
|
||||
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
|
||||
collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra);
|
||||
collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter);
|
||||
collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
|
||||
collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
|
||||
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
|
||||
@@ -1145,16 +1162,38 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
|
||||
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
|
||||
|
||||
// Connect intra-node CollNet
|
||||
char line[1024];
|
||||
line[0]='\0';
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclTree* chain = &comm->channels[c].collnetChain;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d",
|
||||
c, chain->down[0], rank, chain->up);
|
||||
}
|
||||
line[1023] = '\0';
|
||||
INFO(NCCL_INIT, "Collnet Chains %s", line);
|
||||
// Connect Collnet + chain
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
|
||||
INFO(NCCL_INIT, "Connected collnet + chain");
|
||||
|
||||
// Connect intra-node CollNet + Direct
|
||||
int highestTransportType0, highestTransportType1;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
|
||||
|
||||
@@ -1331,6 +1370,8 @@ collnet_cleanup:
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
|
||||
|
||||
/* Local intra-node barrier */
|
||||
NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
|
||||
|
||||
@@ -1358,9 +1399,15 @@ struct ncclCommInitRankAsyncJob {
|
||||
int virtualId;
|
||||
};
|
||||
|
||||
/* Async job descriptor used to finalize/destroy a communicator.
 * `base` is the generic job header handed to ncclAsyncLaunch(); the job
 * callback casts the ncclAsyncJob pointer it receives back to this struct,
 * so `base` has to stay the first member.
 * `comm` is the communicator the job operates on. */
struct ncclCommFinalizeAsyncJob {
|
||||
struct ncclAsyncJob base;
|
||||
ncclComm_t comm;
|
||||
};
|
||||
|
||||
static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
|
||||
ncclComm_t* newcomm = job->newcomm;
|
||||
ncclComm_t comm = *newcomm;
|
||||
int nranks = job->nranks;
|
||||
ncclUniqueId commId = job->commId; // C++ struct assignment
|
||||
int myrank = job->myrank;
|
||||
@@ -1375,60 +1422,86 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes);
|
||||
//CUDACHECKIGNORE(hipDeviceSetLimit(hipLimitStackSize, maxLocalSizeBytes));
|
||||
}
|
||||
*newcomm = NULL;
|
||||
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank, virtualId), res, cleanup);
|
||||
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
|
||||
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
|
||||
|
||||
// update communicator state
|
||||
comm->initState = ncclSuccess;
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx localSize %ld used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, ncclKernLocalSize(ncclGetKernelIndex(*newcomm)), allocTracker[(*newcomm)->cudaDev].totalAllocSize);
|
||||
TRACE_CALL("ncclCommInitRank(%p,%d,0x%llx,%d,%d)", *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev);
|
||||
return ncclSuccess;
|
||||
cleanup:
|
||||
if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap);
|
||||
*newcomm = NULL;
|
||||
comm->initState = res;
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Copy the user-visible configuration into the communicator.
 *
 * comm   : communicator whose `blocking` attribute is set.
 * config : optional configuration; when NULL the communicator defaults to
 *          blocking mode (1).
 *
 * Always returns ncclSuccess. */
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
  if (config == NULL) {
    /* default setting of communicator */
    comm->blocking = 1;
    return ncclSuccess;
  }

  comm->blocking = config->blocking;
  return ncclSuccess;
}
|
||||
|
||||
static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) {
|
||||
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
|
||||
ncclCommDestroy(*job->newcomm);
|
||||
*job->newcomm = nullptr;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, int virtualId) {
|
||||
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config, int virtualId) {
|
||||
ncclResult_t res;
|
||||
ncclComm_t comm = NULL;
|
||||
struct ncclCommInitRankAsyncJob *job = NULL;
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env && myrank == 0) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
||||
NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end);
|
||||
NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, fail);
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclInit(), res, end);
|
||||
NCCLCHECKGOTO(ncclInit(), res, fail);
|
||||
if (myrank == 0) showVersion();
|
||||
|
||||
memset(allocTracker+cudaDev, 0, sizeof(struct allocationTracker));
|
||||
// Make sure the CUDA runtime is initialized.
|
||||
CUDACHECKGOTO(hipFree(NULL), res, end);
|
||||
CUDACHECKGOTO(hipFree(NULL), res, fail);
|
||||
|
||||
NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, end);
|
||||
NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail);
|
||||
if (nranks < 1 || myrank < 0 || myrank >= nranks) {
|
||||
WARN("Invalid rank requested : %d/%d", myrank, nranks);
|
||||
res = ncclInvalidArgument;
|
||||
goto end;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
struct ncclCommInitRankAsyncJob *job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, end);
|
||||
NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail);
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail);
|
||||
// set up comm state and abortFlag only
|
||||
*comm->abortFlag = 0;
|
||||
NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail);
|
||||
/* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
|
||||
comm->initState = ncclInternalError;
|
||||
*newcomm = comm;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
|
||||
job->newcomm = newcomm;
|
||||
job->nranks = nranks;
|
||||
job->commId = commId; // C++ struct assignment
|
||||
job->myrank = myrank;
|
||||
job->cudaDev = cudaDev;
|
||||
job->virtualId = virtualId;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, ncclCommInitRankUndo, free), res, end);
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
|
||||
|
||||
end:
|
||||
exit:
|
||||
return ncclGroupErrCheck(res);
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
|
||||
@@ -1440,7 +1513,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
|
||||
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, -1));
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL, -1));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1449,7 +1522,7 @@ ncclResult_t ncclCommInitRankMulti(ncclComm_t* newcomm, int nranks, ncclUniqueId
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, virtualId));
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL, virtualId));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1457,51 +1530,164 @@ ncclResult_t ncclCommInitRankMulti(ncclComm_t* newcomm, int nranks, ncclUniqueId
|
||||
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
|
||||
/* Create `ndev` communicators within the calling process, one per device.
 *
 * comms   : output array receiving `ndev` communicator handles.
 * ndev    : number of communicators to create; must be >= 0.
 * devlist : optional array of `ndev` device indices. When NULL, devices
 *           0..ndev-1 are used. Every entry must be a valid device index
 *           and no device may be listed twice.
 *
 * Returns ncclSuccess, or the first error encountered:
 *   ncclInvalidArgument    - negative ndev (or a NULL `comms`, via PtrCheck),
 *   ncclUnhandledCudaError - a devlist entry is outside [0, device count),
 *   ncclInvalidUsage       - a device appears more than once in devlist,
 *   or whatever the unique-id / group calls report. */
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t ret = ncclSuccess;
  int totalnDev;
  int *gpuFlags = NULL;
  // Declared up front so that no `goto fail` jumps across its declaration.
  ncclUniqueId uniqueId;

  // Load the CUDA driver and dlsym hooks (can fail on old drivers)
  if (ncclParamDmaBufEnable()) (void) rocmLibraryInit();

  NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
  if (ndev < 0) {
    WARN("Invalid device count requested : %d", ndev);
    ret = ncclInvalidArgument;
    goto fail;
  }

  CUDACHECKGOTO(hipGetDeviceCount(&totalnDev), ret, fail);
  if (devlist) {
    // One flag per physical device, used to spot duplicates in devlist.
    NCCLCHECKGOTO(ncclCalloc(&gpuFlags, totalnDev), ret, fail);
    for (int i = 0; i < ndev; ++i) {
      /* invalid device check. */
      if (devlist[i] < 0 || devlist[i] >= totalnDev) {
        ret = ncclUnhandledCudaError;
        goto fail;
      }

      /* duplicate device check. */
      if (gpuFlags[devlist[i]] != 0) {
        ret = ncclInvalidUsage;
        goto fail;
      }

      gpuFlags[devlist[i]] = 1;
    }
    free(gpuFlags);
    // Clear the pointer: the `fail` path below frees gpuFlags as well, and
    // any later failure would otherwise free the same buffer twice.
    gpuFlags = NULL;
  }

  NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail);
  NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
  for (int i=0; i<ndev; i++) {
    // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
    ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, NULL, -1);
  }
  NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);

exit:
  return ret;
fail:
  free(gpuFlags);  // free(NULL) is a no-op
  goto exit;
}
|
||||
|
||||
static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
// Try and prevent a double free of the comm struct (user error)
|
||||
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
|
||||
WARN("comm %p has already been destroyed", comm);
|
||||
/* Publish a new asynchronous result on a communicator.
 *
 * comm      : communicator to update; must not be NULL.
 * nextState : result code to store; must lie in [0, ncclNumResults).
 *
 * Returns ncclInvalidArgument (after a warning) on a bad argument, otherwise
 * stores the value with release ordering and returns ncclSuccess. */
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
  const bool stateInRange = !(nextState < 0 || nextState >= ncclNumResults);

  if (!stateInRange || comm == NULL) {
    WARN("ncclCommSetAsyncError: error comm %p sets state %d", comm, nextState);
    return ncclInvalidArgument;
  }

  __atomic_store_n(&comm->asyncResult, nextState, __ATOMIC_RELEASE);
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
|
||||
/* Create a communicator for rank `myrank` out of `nranks`, honouring an
 * optional user configuration.
 *
 * newcomm : receives the new communicator handle.
 * config  : optional; when non-NULL its first sizeof(size_t) bytes are read
 *           as the size of the caller's struct, at most sizeof(ncclConfig_t)
 *           bytes are copied over the defaults, and the copy must carry the
 *           magic value written by NCCL_CONFIG_INITIALIZER.
 *
 * The `blocking` attribute must be 0 or 1. The environment parameter
 * returned by ncclParamCommBlocking() forces blocking mode when it is 1; any
 * value other than 0 or 1 only produces a warning.
 *
 * The whole operation runs inside an internal group. For a non-blocking
 * communicator the value returned is the communicator's async result. */
ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  int cudaDev;
  ncclResult_t ret = ncclSuccess;
  ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
  ncclConfig_t *internalConfigPtr = &internalConfig;
  size_t realSize;
  int blockingEnv;

  NCCLCHECK(ncclGroupStartInternal());

  if (config != NULL) {
    /* the caller's struct may be smaller or larger than ours; never copy
     * more than our own definition can hold. */
    memcpy((void*)&realSize, (void*)config, sizeof(size_t));
    if (realSize > sizeof(ncclConfig_t)) realSize = sizeof(ncclConfig_t);
    memcpy((void*)internalConfigPtr, (void*)config, realSize);

    if (internalConfigPtr->magic != 0xcafebeef) {
      WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
      ret = ncclInvalidArgument;
      goto exit;
    }
  }

  /* check input config attributes */
  if (!(internalConfigPtr->blocking == 0 || internalConfigPtr->blocking == 1)) {
    WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
    ret = ncclInvalidArgument;
    goto exit;
  }

  /* overwrite configuration from env variable. */
  blockingEnv = ncclParamCommBlocking();
  if (!(blockingEnv == 0 || blockingEnv == 1)) {
    WARN("Invalid NCCL_COMM_BLOCKING value %d", blockingEnv);
  }
  if (blockingEnv == 1) {
    internalConfigPtr->blocking = blockingEnv;
  }

  if (ncclParamDmaBufEnable()) (void) rocmLibraryInit();
  CUDACHECKGOTO(hipGetDevice(&cudaDev), ret, exit);
  NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr, -1), ret, fail);

exit:
  ncclGroupErrCheck(ret);
  NCCLCHECK(ncclGroupEndInternal());
  if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
  return ret;
fail:
  if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
  goto exit;
}
|
||||
|
||||
/* Synchronous part of communicator teardown; usable as an async-job callback.
 *
 * job_ : pointer to the `base` member of a ncclCommFinalizeAsyncJob whose
 *        `comm` field names the communicator to drain.
 *
 * Switches to the communicator's device if needed, synchronizes the host and
 * device strong streams (only when init completed successfully), polls
 * callbacks until no persistent references remain, then restores the
 * previously current device.
 *
 * Returns ncclSuccess or the first error reported by a checked call. */
static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
  struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_;
  ncclComm_t comm = job->comm;
  int savedDevice;
  int commDevice = comm->cudaDev;
  // Initialized so the value returned at `exit` is defined on every path,
  // independent of how the check macros assign it.
  ncclResult_t ret = ncclSuccess;

  CUDACHECKGOTO(hipGetDevice(&savedDevice), ret, fail);
  if (savedDevice != commDevice) {
    CUDACHECKGOTO(hipSetDevice(commDevice), ret, fail);
  }

  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);

  // The streams are only synchronized for a fully initialized communicator.
  if (comm->initState == ncclSuccess) {
    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->hostStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->deviceStream), ret, fail);
  }
  NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
  // And keep polling until all graphs referencing us die.
  while (comm->persistentRefs != 0) {
    NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
  }

  if (savedDevice != commDevice) {
    CUDACHECKGOTO(hipSetDevice(savedDevice), ret, fail);
  }

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
static ncclResult_t commCleanup(ncclComm_t comm) {
|
||||
int savedDevice;
|
||||
int commDevice = comm->cudaDev;
|
||||
|
||||
CUDACHECK(hipGetDevice(&savedDevice));
|
||||
if (savedDevice != commDevice) {
|
||||
CUDACHECK(hipSetDevice(commDevice));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError);
|
||||
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&comm->hostStream));
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&comm->deviceStream));
|
||||
NCCLCHECK(ncclCommPollCallbacks(comm));
|
||||
|
||||
NCCLCHECK(commFree(comm));
|
||||
|
||||
if (savedDevice != commDevice)
|
||||
@@ -1521,6 +1707,125 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Mark a communicator as finalized and run commDestroySync() on it.
 *
 * comm       : communicator to finalize; `finalizeCalled` is set first.
 * userCalled : true  -> the work is handed to ncclAsyncLaunch(), with free()
 *                       passed as the job's destructor;
 *              false -> the work runs inline and the job is freed here.
 *
 * The result is passed through ncclGroupErrCheck() before being returned. */
static ncclResult_t commFinalize(ncclComm_t comm, bool userCalled) {
  ncclResult_t ret = ncclSuccess;
  struct ncclCommFinalizeAsyncJob *finalizeJob = NULL;

  comm->finalizeCalled = true;

  /* launch async thread to finalize comm. */
  NCCLCHECKGOTO(ncclCalloc(&finalizeJob, 1), ret, fail);
  finalizeJob->comm = comm;

  if (!userCalled) {
    NCCLCHECKGOTO(commDestroySync(&finalizeJob->base), ret, fail);
    free(finalizeJob);
  } else {
    NCCLCHECKGOTO(ncclAsyncLaunch(&finalizeJob->base, commDestroySync, NULL, free, comm), ret, fail);
  }

exit:
  return ncclGroupErrCheck(ret);
fail:
  free(finalizeJob);  /* free(NULL) is a no-op */
  goto exit;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
|
||||
/* Public entry point: finalize a communicator.
 *
 * A NULL communicator is accepted and treated as a no-op. Otherwise the
 * call waits for the communicator to be ready, rejects a second finalize
 * with ncclInvalidArgument, and then runs commFinalize() in user-called
 * mode. Everything happens inside an internal group.
 *
 * For a non-blocking communicator the value returned is the communicator's
 * async result; errors taken through `fail` are recorded there first. */
ncclResult_t ncclCommFinalize(ncclComm_t comm) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t ret = ncclSuccess;

  NCCLCHECK(ncclGroupStartInternal());

  if (comm != NULL) {
    /* wait comm ready before finalize. */
    NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail);

    /* prevent double finalize. */
    if (comm->finalizeCalled) {
      ret = ncclInvalidArgument;
      goto fail;
    }

    /* finalize comm. */
    ret = commFinalize(comm, true);
  }

exit:
  ncclGroupErrCheck(ret);
  NCCLCHECK(ncclGroupEndInternal());
  if (comm != NULL && !comm->blocking) {
    NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
  }
  return ret;
fail:
  if (comm != NULL && !comm->blocking) (void) ncclCommSetAsyncError(comm, ret);
  goto exit;
}
|
||||
|
||||
/* Release the resources of a communicator on destroy/abort.
 *
 * - If the communicator is healthy, not aborted and was never finalized,
 *   it is finalized inline first.
 * - If initialization failed, the calling thread drains and cleans up this
 *   communicator on its own.
 * - Otherwise the intra-process finalize counter kept on intraComm0 is
 *   incremented, and the caller that brings it to intraRanks walks the
 *   intra-process list and reclaims every communicator on it.
 *
 * Returns ncclSuccess, or the error of the last failing step (failures
 * inside the reclaim loops are logged and do not stop the loop). */
static ncclResult_t commReclaim(ncclComm_t comm) {
  ncclResult_t ret = ncclSuccess;
  ncclResult_t state;
  int curRank; /* Debug info */

  NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail);
  TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state);
  if (state == ncclSuccess && *comm->abortFlag == 0 && comm->finalizeCalled == false) {
    /* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy
     * should be nonblocking until last call of ncclCommDestroy. */
    NCCLCHECKGOTO(commFinalize(comm, false), ret, fail);
  }

  if (comm->initState != ncclSuccess) {
    /* if init errors happen, no finalize thread should have been launched. Main thread can reclaim
     * everything since no NCCL kernel was issued. */
    struct ncclCommFinalizeAsyncJob job;

    job.comm = comm;
    curRank = comm->rank;
    /* comm aborts, commDestroySync should not be blocked. */
    if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) {
      WARN("commReclaim: comm %p (rank = %d) in abort, error %d", comm, curRank, ret);
    }

    if ((ret = commCleanup(comm)) != ncclSuccess) {
      WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", comm, curRank, ret);
    }
  } else {
    int curRankCnt;
    int intraRanks = comm->intraRanks;
    ncclComm_t intracomm0 = comm->intraComm0;

    /* validate the pointer before forming an address through it. */
    assert(intracomm0 != NULL);
    int *finalizeRankCnt = &intracomm0->finalizeRankCnt;

    curRankCnt = __atomic_add_fetch(finalizeRankCnt, 1, __ATOMIC_ACQ_REL);
    if (curRankCnt == intraRanks) {
      ncclComm_t curIntraComm;
      ncclComm_t nextIntraComm = intracomm0;

      while (nextIntraComm) {
        curIntraComm = nextIntraComm;
        curRank = curIntraComm->rank;
        /* fetch the successor before curIntraComm is cleaned up. */
        nextIntraComm = nextIntraComm->intraNext;

        /* Test the communicator being reclaimed, not `comm`: commCleanup()
         * frees the communicator it is given, so `comm` may already be
         * gone once an earlier iteration has processed it. */
        if (curIntraComm->finalizeCalled == false) {
          struct ncclCommFinalizeAsyncJob job;
          job.comm = curIntraComm;
          /* every comm aborts, commDestroySync should not be blocked. */
          if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess)
            WARN("commReclaim: comm %p (rank = %d) in abort, error %d", curIntraComm, curRank, ret);
        }

        if ((ret = commCleanup(curIntraComm)) != ncclSuccess) {
          WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret);
        }
      }
    }
  }

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
@@ -1530,9 +1835,18 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
|
||||
int64_t busId = comm->busId;
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
|
||||
// Try and prevent a double free of the comm struct (user error)
|
||||
if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) {
|
||||
WARN("comm %p has already been destroyed", comm);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
|
||||
NCCLCHECK(commDestroy(comm));
|
||||
/* init thread must be joined before we destroy the comm. */
|
||||
NCCLCHECK(ncclCommEnsureReady(comm));
|
||||
|
||||
NCCLCHECK(commReclaim(comm));
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1548,9 +1862,13 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
|
||||
// Ask anything that might still be running on the device to quit
|
||||
*comm->abortFlag = 1;
|
||||
/* init thread must be joined before we destroy the comm,
|
||||
* and we should ignore the init error here. */
|
||||
ncclCommEnsureReady(comm);
|
||||
|
||||
//NCCLCHECK(commDestroy(comm));
|
||||
(void) commReclaim(comm);
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1564,6 +1882,7 @@ const char* ncclGetErrorString(ncclResult_t code) {
|
||||
case ncclInvalidArgument : return "invalid argument";
|
||||
case ncclInvalidUsage : return "invalid usage";
|
||||
case ncclRemoteError : return "remote process exited or there was a network error";
|
||||
case ncclInProgress : return "NCCL operation in progress";
|
||||
default : return "unknown result code";
|
||||
}
|
||||
}
|
||||
@@ -1580,15 +1899,21 @@ NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asy
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
|
||||
NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
|
||||
*asyncError = comm->fatalError;
|
||||
|
||||
*asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
|
||||
/* Report the number of ranks in a communicator.
 *
 * comm  : communicator to query; must not be NULL.
 * count : receives comm->nRanks; must not be NULL.
 *
 * Returns ncclSuccess, or the error from pointer validation or from
 * ncclCommEnsureReady(). */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);

  // Reject NULL arguments before anything is dereferenced.
  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
  NCCLCHECK(PtrCheck(count, "CommCount", "count"));

  // The init thread must be joined before the attributes of comm are read.
  NCCLCHECK(ncclCommEnsureReady(comm));

  const int rankCount = comm->nRanks;
  *count = rankCount;
  return ncclSuccess;
}
|
||||
@@ -1596,8 +1921,12 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
|
||||
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
|
||||
/* Report the device index a communicator is bound to.
 *
 * comm  : communicator to query; must not be NULL.
 * devid : receives comm->cudaDev; must not be NULL.
 *
 * Returns ncclSuccess, or the error from pointer validation or from
 * ncclCommEnsureReady(). */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);

  // Reject NULL arguments before anything is dereferenced.
  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));

  // Checked before any attribute of comm is read.
  NCCLCHECK(ncclCommEnsureReady(comm));

  const int device = comm->cudaDev;
  *devid = device;
  return ncclSuccess;
}
|
||||
@@ -1605,8 +1934,12 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
|
||||
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
|
||||
/* Report the caller's rank within a communicator.
 *
 * comm : communicator to query; must not be NULL.
 * rank : receives comm->rank; must not be NULL.
 *
 * Returns ncclSuccess, or the error from pointer validation or from
 * ncclCommEnsureReady(). */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);

  // Reject NULL arguments before anything is dereferenced.
  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));

  // Checked before any attribute of comm is read.
  NCCLCHECK(ncclCommEnsureReady(comm));

  const int userRank = comm->rank;
  *rank = userRank;
  return ncclSuccess;
}
|
||||
|
||||
+43
-52
@@ -10,32 +10,30 @@
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN(cuDeviceGet, 2000);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString, 6000);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName, 6000);
|
||||
/* enqueue.cc */
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
|
||||
/* proxy.cc */
|
||||
DECLARE_CUDA_PFN(cuCtxCreate_v3020);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN(cuCtxCreate, 3020);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* transport/collNet.cc/net.cc*/
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN(cuInit);
|
||||
DECLARE_CUDA_PFN(cuDriverGetVersion);
|
||||
DECLARE_CUDA_PFN(cuGetProcAddress);
|
||||
|
||||
static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized;
|
||||
DECLARE_CUDA_PFN(cuInit, 2000);
|
||||
DECLARE_CUDA_PFN(cuDriverGetVersion, 2020);
|
||||
DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
|
||||
|
||||
#define CUDA_DRIVER_MIN_VERSION 11030
|
||||
|
||||
@@ -46,46 +44,37 @@ static int cudaDriverVersion;
|
||||
/*
|
||||
Load the CUDA symbols
|
||||
*/
|
||||
static int cudaPfnFuncLoader(void) {
|
||||
static ncclResult_t cudaPfnFuncLoader(void) {
|
||||
CUresult res;
|
||||
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \
|
||||
#define LOAD_SYM(symbol, version, ignore) do { \
|
||||
res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), version, 0); \
|
||||
if (res != 0) { \
|
||||
if (!ignore) { \
|
||||
WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \
|
||||
WARN("Retrieve %s version %d failed with %d", #symbol, version, res); \
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
|
||||
LOAD_SYM(cuGetErrorString, 0);
|
||||
LOAD_SYM(cuGetErrorName, 0);
|
||||
LOAD_SYM(cuDeviceGet, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 1);
|
||||
LOAD_SYM(cuCtxCreate_v3020, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 1);
|
||||
LOAD_SYM(cuGetErrorString, 6000, 0);
|
||||
LOAD_SYM(cuGetErrorName, 6000, 0);
|
||||
LOAD_SYM(cuDeviceGet, 2000, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 2000, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 3020, 1);
|
||||
LOAD_SYM(cuCtxCreate, 3020, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 4000, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 4000, 1);
|
||||
#if CUDA_VERSION >= 11070
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
ncclResult_t cudaLibraryInit(void) {
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
static ncclResult_t initResult;
|
||||
|
||||
static void initOnceFunc() {
|
||||
CUresult res;
|
||||
|
||||
if (cudaState == cudaInitialized)
|
||||
return ncclSuccess;
|
||||
if (cudaState == cudaError)
|
||||
return ncclSystemError;
|
||||
|
||||
if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (cudaState == cudaInitializing) sched_yield();
|
||||
return (cudaState == cudaInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
|
||||
/*
|
||||
* Load CUDA driver library
|
||||
*/
|
||||
@@ -106,13 +95,13 @@ ncclResult_t cudaLibraryInit(void) {
|
||||
* Load initial CUDA functions
|
||||
*/
|
||||
|
||||
pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit");
|
||||
pfn_cuInit = (PFN_cuInit_v2000) dlsym(cudaLib, "cuInit");
|
||||
if (pfn_cuInit == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuInit");
|
||||
goto error;
|
||||
}
|
||||
|
||||
pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion");
|
||||
pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion_v2020) dlsym(cudaLib, "cuDriverGetVersion");
|
||||
if (pfn_cuDriverGetVersion == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
|
||||
goto error;
|
||||
@@ -132,7 +121,7 @@ ncclResult_t cudaLibraryInit(void) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress");
|
||||
pfn_cuGetProcAddress = (PFN_cuGetProcAddress_v11030) dlsym(cudaLib, "cuGetProcAddress");
|
||||
if (pfn_cuGetProcAddress == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuGetProcAddress");
|
||||
goto error;
|
||||
@@ -145,19 +134,21 @@ ncclResult_t cudaLibraryInit(void) {
|
||||
*/
|
||||
pfn_cuInit(0);
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (cudaPfnFuncLoader()) {
|
||||
WARN("CUDA some PFN functions not found in the library");
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaState = cudaInitialized;
|
||||
return ncclSuccess;
|
||||
#endif
|
||||
|
||||
initResult = ncclSuccess;
|
||||
return;
|
||||
error:
|
||||
cudaState = cudaError;
|
||||
return ncclSystemError;
|
||||
initResult = ncclSystemError;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* Load the driver library and its symbols at most once per process.
 * pthread_once() guarantees initOnceFunc() runs a single time even when
 * several threads call in concurrently; initOnceFunc() records its outcome
 * in initResult, which every caller then returns. */
ncclResult_t cudaLibraryInit() {
|
||||
pthread_once(&initOnceControl, initOnceFunc);
|
||||
return initResult;
|
||||
}
|
||||
|
||||
+12
-17
@@ -9,8 +9,6 @@
|
||||
#ifndef GDR_DIRECT
|
||||
#include "core.h"
|
||||
|
||||
static enum { gdrUninitialized, gdrInitializing, gdrInitialized, gdrError } gdrState = gdrUninitialized;
|
||||
|
||||
/* Function pointers assigned from dlopen() */
|
||||
static gdr_t (*gdr_internal_open)(void);
|
||||
static int (*gdr_internal_close)(gdr_t g);
|
||||
@@ -49,18 +47,10 @@ pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
*cast = tmp; \
|
||||
} while (0)
|
||||
|
||||
ncclResult_t wrap_gdr_symbols(void) {
|
||||
if (gdrState == gdrInitialized)
|
||||
return ncclSuccess;
|
||||
if (gdrState == gdrError)
|
||||
return ncclSystemError;
|
||||
|
||||
if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (gdrState == gdrInitializing) sched_yield();
|
||||
return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
static ncclResult_t initResult;
|
||||
|
||||
static void initOnceFunc(void) {
|
||||
static void* gdrhandle = NULL;
|
||||
void* tmp;
|
||||
void** cast;
|
||||
@@ -84,8 +74,8 @@ ncclResult_t wrap_gdr_symbols(void) {
|
||||
LOAD_SYM(gdrhandle, "gdr_copy_to_mapping", gdr_internal_copy_to_mapping);
|
||||
LOAD_SYM(gdrhandle, "gdr_copy_from_mapping", gdr_internal_copy_from_mapping);
|
||||
|
||||
gdrState = gdrInitialized;
|
||||
return ncclSuccess;
|
||||
initResult = ncclSuccess;
|
||||
return;
|
||||
|
||||
teardown:
|
||||
gdr_internal_open = NULL;
|
||||
@@ -101,11 +91,16 @@ teardown:
|
||||
gdr_internal_copy_from_mapping = NULL;
|
||||
|
||||
if (gdrhandle != NULL) dlclose(gdrhandle);
|
||||
gdrState = gdrError;
|
||||
return ncclSystemError;
|
||||
initResult = ncclSystemError;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
ncclResult_t wrap_gdr_symbols(void) {
|
||||
pthread_once(&initOnceControl, initOnceFunc);
|
||||
return initResult;
|
||||
}
|
||||
|
||||
gdr_t wrap_gdr_open(void) {
|
||||
if (gdr_internal_open == NULL) {
|
||||
WARN("GDRCOPY lib wrapper not initialized.");
|
||||
|
||||
+16
-20
@@ -11,8 +11,6 @@
|
||||
#include <dlfcn.h>
|
||||
#include "core.h"
|
||||
|
||||
static enum { ibvUninitialized, ibvInitializing, ibvInitialized, ibvError } ibvState = ibvUninitialized;
|
||||
|
||||
/*Function Pointers*/
|
||||
int (*ibv_internal_fork_init)(void);
|
||||
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
|
||||
@@ -43,18 +41,10 @@ const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
|
||||
// IBVERBS Library versioning
|
||||
#define IBVERBS_VERSION "IBVERBS_1.1"
|
||||
|
||||
ncclResult_t wrap_ibv_symbols(void) {
|
||||
if (ibvState == ibvInitialized)
|
||||
return ncclSuccess;
|
||||
if (ibvState == ibvError)
|
||||
return ncclSystemError;
|
||||
|
||||
if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (ibvState == ibvInitializing) sched_yield();
|
||||
return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
static ncclResult_t initResult;
|
||||
|
||||
static void initOnceFunc(void) {
|
||||
static void* ibvhandle = NULL;
|
||||
void* tmp;
|
||||
void** cast;
|
||||
@@ -111,8 +101,8 @@ ncclResult_t wrap_ibv_symbols(void) {
|
||||
LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init);
|
||||
LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str);
|
||||
|
||||
ibvState = ibvInitialized;
|
||||
return ncclSuccess;
|
||||
initResult = ncclSuccess;
|
||||
return;
|
||||
|
||||
teardown:
|
||||
ibv_internal_get_device_list = NULL;
|
||||
@@ -141,8 +131,13 @@ teardown:
|
||||
ibv_internal_event_type_str = NULL;
|
||||
|
||||
if (ibvhandle != NULL) dlclose(ibvhandle);
|
||||
ibvState = ibvError;
|
||||
return ncclSystemError;
|
||||
initResult = ncclSystemError;
|
||||
return;
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_symbols(void) {
|
||||
pthread_once(&initOnceControl, initOnceFunc);
|
||||
return initResult;
|
||||
}
|
||||
|
||||
#define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \
|
||||
@@ -256,7 +251,7 @@ ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
|
||||
IBV_PTR_CHECK(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
||||
@@ -290,6 +285,7 @@ ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint
|
||||
|
||||
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
|
||||
if (ibv_internal_reg_dmabuf_mr == NULL) {
|
||||
errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
|
||||
return NULL;
|
||||
}
|
||||
return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
|
||||
@@ -300,7 +296,7 @@ ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or t
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
|
||||
IBV_PTR_CHECK(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
|
||||
@@ -312,7 +308,7 @@ ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
|
||||
IBV_PTR_CHECK(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
||||
|
||||
@@ -38,6 +38,7 @@ static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void
|
||||
WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize);
|
||||
return ncclSystemError;
|
||||
}
|
||||
INFO(NCCL_ALLOC, "Allocated %d bytes of shared memory in %s\n", shmSize, shmPath);
|
||||
} else {
|
||||
SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
|
||||
}
|
||||
@@ -81,10 +82,12 @@ ncclResult_t ncclShmUnlink(const char* shmPath) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) {
|
||||
if (devShmPtr) CUDACHECK(hipHostUnregister(shmPtr));
|
||||
if (munmap(shmPtr, shmSize) != 0) {
|
||||
WARN("munmap of shared memory failed");
|
||||
return ncclSystemError;
|
||||
if (shmPtr) {
|
||||
if (devShmPtr) CUDACHECK(hipHostUnregister(shmPtr));
|
||||
if (munmap(shmPtr, shmSize) != 0) {
|
||||
WARN("munmap of shared memory failed");
|
||||
return ncclSystemError;
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+43
-26
@@ -340,10 +340,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
|
||||
#endif
|
||||
}
|
||||
|
||||
if (sock->asyncFlag) {
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
/* The socket is set non-blocking for OS level, but asyncFlag is used to control
|
||||
* blocking and non-blocking behavior in user level. */
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
|
||||
// addr port should be 0 (Any port)
|
||||
SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
|
||||
@@ -419,11 +419,10 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
|
||||
const int one = 1;
|
||||
SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
|
||||
|
||||
/* support non-blocking socket; by default, the socket is non-blocking */
|
||||
if (sock->asyncFlag) {
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
/* The socket is set non-blocking for OS level, but asyncFlag is used to control
|
||||
* blocking and non-blocking behavior in user level. */
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
|
||||
/* const int bufsize = 128*1024;
|
||||
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
|
||||
@@ -467,17 +466,26 @@ retry:
|
||||
/* blocking/non-blocking connect() is determined by asyncFlag. */
|
||||
ret = connect(fd, &sock->addr.sa, salen);
|
||||
|
||||
if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) {
|
||||
if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
}
|
||||
if (!sock->asyncFlag) {
|
||||
/* blocking socket, need retry if connect fails. */
|
||||
if (errno == EINPROGRESS || errno == EAGAIN || errno == EALREADY ||
|
||||
(errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
|
||||
/* check abortFlag as long as we have chance to retry. */
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
||||
if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
/* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
|
||||
* However, it can return EISCONN instead of success which indicates connection is built up in
|
||||
* background already. No need to call connect() again. */
|
||||
if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) {
|
||||
/* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
|
||||
* However, it can return EISCONN instead of success which indicates connection is built up in
|
||||
* background already. No need to call connect() again. */
|
||||
if (ret == 0 || errno == EISCONN) {
|
||||
sock->fd = fd;
|
||||
return ncclSuccess;
|
||||
}
|
||||
} else {
|
||||
sock->fd = fd;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -488,17 +496,26 @@ retry:
|
||||
|
||||
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
|
||||
socklen_t socklen = sizeof(union ncclSocketAddress);
|
||||
struct pollfd pollfd;
|
||||
int tmpFd = sock->fd = -1;
|
||||
int pollret;
|
||||
|
||||
do {
|
||||
if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0);
|
||||
pollfd.fd = listenSocket->fd;
|
||||
pollfd.events = POLLIN;
|
||||
retry:
|
||||
if ((pollret = poll(&pollfd, 1, listenSocket->asyncFlag ? 0 : 100)) < 0) {
|
||||
return ncclSystemError;
|
||||
} else {
|
||||
tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen);
|
||||
} while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag);
|
||||
}
|
||||
|
||||
if (!listenSocket->asyncFlag) {
|
||||
/* blocking socket, if tmpFd is still -1, we need to retry */
|
||||
if (tmpFd == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
|
||||
if (listenSocket->abortFlag && *listenSocket->abortFlag != 0) return ncclInternalError;
|
||||
goto retry;
|
||||
}
|
||||
EQCHECK(tmpFd, -1);
|
||||
} else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
sock->fd = tmpFd;
|
||||
@@ -528,7 +545,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void*
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
do {
|
||||
if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
|
||||
if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
|
||||
if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
|
||||
if (op == NCCL_SOCKET_RECV && bytes == 0) {
|
||||
*closed = 1;
|
||||
return ncclSuccess;
|
||||
@@ -544,7 +561,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void*
|
||||
(*offset) += bytes;
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) {
|
||||
INFO(NCCL_NET, "Socket progress: abort called");
|
||||
return ncclSystemError;
|
||||
return ncclInternalError;
|
||||
}
|
||||
} while (bytes > 0 && (*offset) < size);
|
||||
return ncclSuccess;
|
||||
|
||||
+78
-37
@@ -42,7 +42,28 @@ typedef enum { ncclSuccess = 0,
|
||||
ncclInvalidArgument = 4,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclRemoteError = 6,
|
||||
ncclNumResults = 7 } ncclResult_t;
|
||||
ncclInProgress = 7,
|
||||
ncclNumResults = 8 } ncclResult_t;
|
||||
|
||||
/* Communicator configuration. Users can assign value to attributes to specify the
|
||||
* behavior of a communicator. */
|
||||
typedef struct ncclConfig_v21400 {
|
||||
/* attributes that users should never touch. */
|
||||
size_t size;
|
||||
unsigned int magic;
|
||||
unsigned int version;
|
||||
/* attributes that users are able to customize. */
|
||||
int blocking;
|
||||
} ncclConfig_t;
|
||||
|
||||
/* Config initializer must be assigned to initialize config structure when it is created.
|
||||
* Not initialized config will result in NCCL error. */
|
||||
#define NCCL_CONFIG_INITIALIZER { \
|
||||
sizeof(ncclConfig_t), /* size */ \
|
||||
0xcafebeef, /* magic */ \
|
||||
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
|
||||
1 /* blocking */ \
|
||||
}
|
||||
|
||||
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
*
|
||||
@@ -50,7 +71,7 @@ typedef enum { ncclSuccess = 0,
|
||||
* NCCL library
|
||||
*/
|
||||
ncclResult_t ncclGetVersion(int *version);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetVersion(int *version);
|
||||
/// @endcond
|
||||
|
||||
@@ -67,10 +88,17 @@ ncclResult_t pncclGetVersion(int *version);
|
||||
|
||||
*/
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Create a new communicator (multi thread/process version) with a configuration
|
||||
* set by users. */
|
||||
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version).
|
||||
|
||||
@details
|
||||
@@ -85,7 +113,7 @@ ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
communicator struct pointer
|
||||
*/
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
||||
/// @endcond
|
||||
|
||||
@@ -120,25 +148,38 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
|
||||
* Order of devlist defines user-order of processors within the communicator.
|
||||
* */
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object, but waits for any operations that might still be running on the device */
|
||||
/*! @brief Finalize a communicator.
|
||||
* @details ncclCommFinalize flushes all issued communications,
|
||||
* and marks communicator state as ncclInProgress. The state will change to ncclSuccess
|
||||
* when the communicator is globally quiescent and related resources are freed; then,
|
||||
* calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
|
||||
* itself) without blocking. */
|
||||
ncclResult_t ncclCommFinalize(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommFinalize(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees local resources associated with communicator object. */
|
||||
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommDestroy(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object and aborts any operations that might still be running on the device. */
|
||||
/*! @brief Frees resources associated with communicator object and aborts any operations
|
||||
* that might still be running on the device. */
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns a string for each error code. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
/// @endcond
|
||||
|
||||
@@ -146,31 +187,31 @@ const char* pncclGetErrorString(ncclResult_t result);
|
||||
* comm is currently unused and can be set to NULL
|
||||
*/
|
||||
const char* ncclGetLastError(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
const char* pncclGetError(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/* Checks whether the comm has encountered any asynchronous errors */
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Gets the number of ranks in the communicator clique. */
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns the rocm device number associated with the communicator. */
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns the user-ordered "rank" associated with the communicator. */
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @endcond
|
||||
|
||||
@@ -207,7 +248,7 @@ typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
ncclBfloat16 = 9,
|
||||
ncclNumTypes = 10 } ncclDataType_t;
|
||||
|
||||
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
|
||||
/*! @brief ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
|
||||
typedef enum {
|
||||
/* ncclScalarDevice: The scalar is in device-visible memory and will be
|
||||
* dereferenced while the collective is running. */
|
||||
@@ -218,9 +259,7 @@ typedef enum {
|
||||
ncclScalarHostImmediate = 1
|
||||
} ncclScalarResidence_t;
|
||||
|
||||
/*
|
||||
* ncclRedOpCreatePreMulSum
|
||||
*
|
||||
/*! @brief ncclRedOpCreatePreMulSum
|
||||
* Creates a new reduction operator which pre-multiplies input values by a given
|
||||
* scalar locally before reducing them with peer values via summation. For use
|
||||
* only with collectives launched against *comm* and *datatype*. The
|
||||
@@ -229,17 +268,19 @@ typedef enum {
|
||||
* is stored in *op*.
|
||||
*/
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* ncclRedOpDestroy
|
||||
*
|
||||
* Destroys the reduction operator *op*. The operator must have been created by
|
||||
/*! @brief ncclRedOpDestroy
|
||||
* @details Destroys the reduction operator *op*. The operator must have been created by
|
||||
* ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
|
||||
* destroyed as soon as the last NCCL function which is given that operator returns.
|
||||
*/
|
||||
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Collective communication operations
|
||||
@@ -267,7 +308,7 @@ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
*/
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -282,7 +323,7 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc
|
||||
*/
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -297,7 +338,7 @@ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int r
|
||||
*/
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -311,7 +352,7 @@ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
|
||||
*/
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -330,7 +371,7 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
@@ -347,7 +388,7 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
*/
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -364,7 +405,7 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
|
||||
*/
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -381,7 +422,7 @@ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t dataty
|
||||
*/
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -398,7 +439,7 @@ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, in
|
||||
*/
|
||||
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -416,7 +457,7 @@ ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
@@ -432,7 +473,7 @@ ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff,
|
||||
*/
|
||||
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
@@ -451,7 +492,7 @@ ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
|
||||
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
|
||||
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
@@ -486,7 +527,7 @@ ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
|
||||
* ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupStart();
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupStart();
|
||||
/// @endcond
|
||||
|
||||
@@ -497,7 +538,7 @@ ncclResult_t pncclGroupStart();
|
||||
* need to be called after ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupEnd();
|
||||
/// @cond include_hidden
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupEnd();
|
||||
/// @endcond
|
||||
|
||||
|
||||
+2
-6
@@ -335,12 +335,8 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
|
||||
ncclResult_t ret;
|
||||
ncclDebugNoWarn = NCCL_NET;
|
||||
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
|
||||
while (sComm == NULL) {
|
||||
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
|
||||
}
|
||||
while (rComm == NULL) {
|
||||
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
|
||||
}
|
||||
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
|
||||
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
|
||||
CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
|
||||
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
|
||||
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
|
||||
|
||||
+71
-37
@@ -407,17 +407,17 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire));
|
||||
}
|
||||
} break;
|
||||
case ncclPatternCollTreeUpDown: {
|
||||
// CollTree up
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push
|
||||
// CollTree down
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire));
|
||||
case ncclPatternCollnetChain: {
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetChain.up, op, 1, justInquire));
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire));
|
||||
} break;
|
||||
case ncclPatternCollnetDirect: {
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetDirect.out, op, 1, justInquire));
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire));
|
||||
} break;
|
||||
case ncclPatternSend:
|
||||
case ncclPatternRecv: {
|
||||
if (op->root == comm->rank) return ncclSuccess;
|
||||
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
|
||||
} break;
|
||||
}
|
||||
@@ -433,16 +433,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
op->channelId = channelId;
|
||||
op->sliceSteps = 1;
|
||||
op->chunkSteps = 1;
|
||||
op->protocol = NCCL_PROTO_SIMPLE;
|
||||
op->dtype = info->datatype;
|
||||
op->protocol = info->protocol;
|
||||
|
||||
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR;
|
||||
int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS;
|
||||
|
||||
// If nNodes > 1 and we're using Simple, reduce the stepSize to increase shared buffer utilization
|
||||
if (info->comm->nNodes > 1 && op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pNetChunkSize;
|
||||
info->chunkSize = stepSize;
|
||||
op->root = info->root;
|
||||
op->nbytes = info->count;
|
||||
struct ncclChannelPeer* peer = channel->peers + op->root;
|
||||
|
||||
struct ncclChannelPeer* peer = channel->peers + op->root;
|
||||
if (info->coll == ncclFuncSend) {
|
||||
op->pattern = ncclPatternSend;
|
||||
if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
|
||||
@@ -465,6 +466,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
info->chunkSize = ncclParamChunkSize();
|
||||
}
|
||||
op->chunkSize = info->chunkSize;
|
||||
|
||||
// Compute nSteps for proxies
|
||||
int chunkEffectiveSize = op->chunkSize;
|
||||
if (op->protocol == NCCL_PROTO_LL) {
|
||||
chunkEffectiveSize /= 2;
|
||||
}
|
||||
|
||||
op->nbytes = stepSize;
|
||||
op->nsteps = DIVUP(info->count, chunkEffectiveSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -617,7 +629,7 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
|
||||
if (createThreadContext == -1) {
|
||||
createThreadContext = ncclParamCreateThreadContext();
|
||||
if (createThreadContext) {
|
||||
if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
|
||||
if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
|
||||
WARN("Unable to create thread context due to old driver, disabling.");
|
||||
createThreadContext = 0;
|
||||
}
|
||||
@@ -625,7 +637,7 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
|
||||
}
|
||||
if (createThreadContext) {
|
||||
if (comm->proxyState.cudaCtx == NULL) {
|
||||
if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx,
|
||||
if (CUPFN(cuCtxCreate(&comm->proxyState.cudaCtx,
|
||||
CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to create CUDA context on device %d", comm->cudaDev);
|
||||
createThreadContext = 0;
|
||||
@@ -642,6 +654,9 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs
|
||||
NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1);
|
||||
|
||||
void* ncclProxyProgress(void *comm_) {
|
||||
struct ncclComm* comm = (struct ncclComm*)comm_;
|
||||
if (ncclSetThreadContext(comm) != ncclSuccess) {
|
||||
@@ -653,7 +668,8 @@ void* ncclProxyProgress(void *comm_) {
|
||||
|
||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||
state->nextOps = -1;
|
||||
signal(SIGUSR1, ncclDumpProxyState);
|
||||
const int sig = ncclParamProxyDumpSignal();
|
||||
if (sig != -1) signal(sig, ncclDumpProxyState);
|
||||
ncclLastProxyState = state;
|
||||
char threadName[NCCL_THREAD_NAMELEN];
|
||||
snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev);
|
||||
@@ -665,7 +681,7 @@ void* ncclProxyProgress(void *comm_) {
|
||||
int idle = 1;
|
||||
ncclResult_t ret = progressOps(comm, state, state->active, &idle);
|
||||
if (ret != ncclSuccess) {
|
||||
comm->fatalError = ret;
|
||||
(void) ncclCommSetAsyncError(comm, ret);
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
|
||||
return NULL;
|
||||
}
|
||||
@@ -677,7 +693,7 @@ void* ncclProxyProgress(void *comm_) {
|
||||
ret = ncclProxyGetPostedOps(comm, &added);
|
||||
if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
|
||||
if (ret != ncclSuccess) {
|
||||
comm->fatalError = ret;
|
||||
(void) ncclCommSetAsyncError(comm, ret);
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
|
||||
}
|
||||
if (added == 0) {
|
||||
@@ -783,9 +799,13 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool,
|
||||
|
||||
static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
if (connection->send) {
|
||||
NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm));
|
||||
if (ncclTransports[connection->transport]->send.proxyFree) {
|
||||
NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm));
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm));
|
||||
if (ncclTransports[connection->transport]->recv.proxyFree) {
|
||||
NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -794,7 +814,10 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
|
||||
for (int b=0; b<pool->banks; b++) {
|
||||
int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
|
||||
for (int i=0; i<max; i++) {
|
||||
NCCLCHECK(proxyFree(pool->pools[b]+i, comm));
|
||||
ncclProxyConnection *connection = pool->pools[b]+i;
|
||||
if (connection->initFlag == true) {
|
||||
NCCLCHECK(proxyFree(connection, comm));
|
||||
}
|
||||
}
|
||||
free(pool->pools[b]);
|
||||
}
|
||||
@@ -813,8 +836,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
|
||||
for (int r=0; r<comm->localRanks; r++) {
|
||||
comm->proxyState.peerSocks[r].fd = -1;
|
||||
comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag;
|
||||
NCCLCHECK(ncclSocketInit(&comm->proxyState.peerSocks[r], NULL, comm->abortFlag, 0));
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
|
||||
@@ -944,6 +966,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
|
||||
}
|
||||
INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
|
||||
__atomic_store_n(&connection->initFlag, true, __ATOMIC_RELEASE);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1020,7 +1043,8 @@ void* ncclProxyService(void* _args) {
|
||||
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
|
||||
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
|
||||
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
|
||||
peers[s].sock.fd = pollfds[s].fd = -1;
|
||||
ncclSocketInit(&peers[s].sock, NULL, comm->abortFlag, 0);
|
||||
pollfds[s].fd = -1;
|
||||
pollfds[s].events = POLLHUP|POLLIN;
|
||||
}
|
||||
pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
|
||||
@@ -1030,8 +1054,9 @@ void* ncclProxyService(void* _args) {
|
||||
int npeers = 0;
|
||||
int stop = 0;
|
||||
int asyncOpCount = 0;
|
||||
while (stop == 0 || (stop == 1 && npeers > 0)) {
|
||||
if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : -1) < 0) {
|
||||
while ((stop == 0 || (stop == 1 && npeers > 0)) && *comm->abortFlag == 0) {
|
||||
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
||||
if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500) < 0) {
|
||||
WARN("[Proxy Service] Poll failed with error %d", error);
|
||||
return NULL;
|
||||
}
|
||||
@@ -1072,10 +1097,7 @@ void* ncclProxyService(void* _args) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
|
||||
closeConn = 1;
|
||||
} else {
|
||||
if (type == ncclProxyMsgAbort) {
|
||||
stop = 2;
|
||||
closeConn = 1;
|
||||
} else if (type == ncclProxyMsgStop) {
|
||||
if (type == ncclProxyMsgStop) {
|
||||
stop = 1;
|
||||
closeConn = 1;
|
||||
} else if (type == ncclProxyMsgClose) {
|
||||
@@ -1105,6 +1127,10 @@ void* ncclProxyService(void* _args) {
|
||||
}
|
||||
}
|
||||
}
|
||||
/* wait until main thread flush all NCCL operations. */
|
||||
while (*comm->abortFlag != 0 && __atomic_load_n(&comm->proxyState.safeAbortFlag, __ATOMIC_ACQUIRE) == 0)
|
||||
usleep(1000);
|
||||
|
||||
// Wait for all operations to complete and stop progress thread before freeing any resource
|
||||
if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
|
||||
WARN("[Proxy Service] proxyDestroy failed");
|
||||
@@ -1134,15 +1160,23 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
|
||||
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
|
||||
if (state == NULL) return ncclSuccess;
|
||||
if (state->peerAddresses) {
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = NULL;
|
||||
sock.asyncFlag = 0;
|
||||
memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketConnect(&sock));
|
||||
int type = (*comm->abortFlag) ? ncclProxyMsgAbort : ncclProxyMsgStop;
|
||||
NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
|
||||
close(sock.fd);
|
||||
if (*comm->abortFlag == 0) {
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = NULL;
|
||||
sock.asyncFlag = 0;
|
||||
memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketConnect(&sock));
|
||||
int type = ncclProxyMsgStop;
|
||||
NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
|
||||
close(sock.fd);
|
||||
} else {
|
||||
/* when abortFlag is set, all socket related communications are no longer reliable. We need to
|
||||
* set a flag to let proxy thread exit. */
|
||||
__atomic_store_n(&state->safeAbortFlag, 1, __ATOMIC_RELEASE);
|
||||
}
|
||||
free(state->peerAddresses);
|
||||
}
|
||||
if (state->peerSocks) {
|
||||
|
||||
@@ -101,6 +101,7 @@ struct sendResources {
|
||||
int nranks;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int useDmaBuf;
|
||||
uint64_t* gdcSync;
|
||||
void* gdrDesc;
|
||||
void* sendMhandles[NCCL_NUM_PROTOCOLS];
|
||||
@@ -121,6 +122,7 @@ struct recvResources {
|
||||
int nranks;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int useDmaBuf;
|
||||
uint64_t* gdcSync;
|
||||
uint64_t* gdcFlush;
|
||||
void* gdrDesc;
|
||||
@@ -157,7 +159,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -175,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -284,6 +286,10 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
|
||||
|
||||
resources->netDev = req->netDev;
|
||||
resources->useGdr = req->useGdr;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
|
||||
/* DMA-BUF support */
|
||||
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -389,6 +395,10 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
|
||||
|
||||
resources->netDev = req->netDev;
|
||||
resources->useGdr = req->useGdr;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
|
||||
/* DMA-BUF support */
|
||||
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
|
||||
|
||||
collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
|
||||
if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
|
||||
@@ -452,7 +462,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
if (resources->useGdr && comm->dmaBufSupport) {
|
||||
if (resources->useGdr && resources->useDmaBuf) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
@@ -521,7 +531,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
if (resources->useGdr && comm->dmaBufSupport) {
|
||||
if (resources->useGdr && resources->useDmaBuf) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
@@ -548,35 +558,41 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
|
||||
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->sendMhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
|
||||
|
||||
if (resources) {
|
||||
for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->sendMhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
|
||||
CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
|
||||
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
|
||||
NCCLCHECK(sharedBuffersDestroy(comm));
|
||||
NCCLCHECK(sharedFree(comm, resources->netDev));
|
||||
free(connection->transportResources);
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
|
||||
CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
|
||||
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
|
||||
NCCLCHECK(sharedBuffersDestroy(comm));
|
||||
NCCLCHECK(sharedFree(comm, resources->netDev));
|
||||
free(connection->transportResources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->mhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
|
||||
|
||||
if (resources) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->mhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
|
||||
CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
|
||||
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
|
||||
NCCLCHECK(sharedBuffersDestroy(comm));
|
||||
NCCLCHECK(sharedFree(comm, resources->netDev));
|
||||
free(connection->transportResources);
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
|
||||
CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
|
||||
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
|
||||
NCCLCHECK(sharedBuffersDestroy(comm));
|
||||
NCCLCHECK(sharedFree(comm, resources->netDev));
|
||||
free(connection->transportResources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -585,10 +601,6 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
(s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
|
||||
|
||||
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
|
||||
if (args->protocol != NCCL_PROTO_SIMPLE) {
|
||||
WARN("CollNet does not support LL/LL128");
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
@@ -603,7 +615,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
args->idle = 1;
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = args->protocol;
|
||||
int p = NCCL_PROTO_SIMPLE;
|
||||
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
|
||||
int perGroupSteps = NCCL_STEPS / nGroups;
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
@@ -704,10 +716,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
|
||||
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
|
||||
if (args->protocol != NCCL_PROTO_SIMPLE) {
|
||||
WARN("CollNet does not support LL/LL128");
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
@@ -721,7 +729,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
args->idle = 1;
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = args->protocol;
|
||||
int p = NCCL_PROTO_SIMPLE;
|
||||
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
|
||||
int perGroupSteps = NCCL_STEPS / nGroups;
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
@@ -755,7 +763,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
|
||||
sub->received += args->sliceSteps;
|
||||
sub->requests[buffSlot] = NULL;
|
||||
if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) {
|
||||
if (1 && reqFifo[group][buffSlot].size > 0 && resources->useGdr) {
|
||||
// GDRCOPY support
|
||||
if (resources->gdcFlush) {
|
||||
#if defined (__x86_64__)
|
||||
|
||||
+38
-23
@@ -96,6 +96,7 @@ struct sendResources {
|
||||
int remoteRank;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int useDmaBuf;
|
||||
int maxRecvs;
|
||||
uint64_t* gdcSync;
|
||||
void* gdrDesc;
|
||||
@@ -123,6 +124,7 @@ struct recvResources {
|
||||
int proxyRank;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int useDmaBuf;
|
||||
int maxRecvs;
|
||||
uint64_t* gdcSync;
|
||||
uint64_t* gdcFlush;
|
||||
@@ -138,19 +140,16 @@ struct recvResources {
|
||||
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
|
||||
|
||||
/* Determine if two peers can communicate with NET */
|
||||
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
// Same host?
|
||||
if (info1->hostHash == info2->hostHash) {
|
||||
// User disabled NET for intra-node?
|
||||
if (ncclParamNetDisableIntra() == 1) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
*ret = 1;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
if (info1->hostHash == info2->hostHash) {
|
||||
// If on the same host, check intra-node net is not disabled.
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret));
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -354,12 +353,15 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
|
||||
static ncclResult_t sendFree(struct ncclConnector* send) {
|
||||
struct connectMap* map = (struct connectMap*)(send->transportResources);
|
||||
if (map->sameProcess == 0) {
|
||||
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
|
||||
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||
CUDACHECK(hipIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
if (map) {
|
||||
if (map->sameProcess == 0) {
|
||||
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
|
||||
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||
CUDACHECK(hipIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -386,7 +388,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
|
||||
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
|
||||
state->refcount++;
|
||||
if (state->size == 0) {
|
||||
state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
|
||||
state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pNetChunkSize;
|
||||
}
|
||||
|
||||
if (size) *size = state->size;
|
||||
@@ -412,9 +414,8 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
|
||||
|
||||
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
|
||||
// Use different pools for different channels and also separate send/recv.
|
||||
int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
|
||||
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
|
||||
*offset = slotSize * globalSlot;
|
||||
*offset = comm->p2pNetChunkSize * globalSlot;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -467,6 +468,8 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
|
||||
resources->curr_hdp_reg = req->curr_hdp_reg;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
|
||||
/* DMA-BUF support */
|
||||
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
|
||||
resources->maxRecvs = props.maxRecvs;
|
||||
|
||||
// We don't return any data
|
||||
@@ -493,6 +496,8 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
|
||||
resources->connIndex = req->connIndex;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
|
||||
/* DMA-BUF support */
|
||||
resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
|
||||
resources->maxRecvs = props.maxRecvs;
|
||||
|
||||
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
|
||||
@@ -560,6 +565,12 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels,
|
||||
&mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc));
|
||||
resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
|
||||
|
||||
if (comm->allocP2pNetLLBuffers) {
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
|
||||
resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
|
||||
}
|
||||
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
|
||||
}
|
||||
|
||||
@@ -608,7 +619,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
|
||||
if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
|
||||
@@ -707,6 +718,11 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
|
||||
|
||||
if (comm->allocP2pNetLLBuffers) {
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
|
||||
resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
|
||||
}
|
||||
|
||||
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||
if (resources->shared == 0) {
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, comm->sideStream, resources->useGdr));
|
||||
@@ -737,7 +753,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
|
||||
if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
|
||||
@@ -899,12 +915,11 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
|
||||
// We have something to receive, let's check if it's completely ready.
|
||||
int size = sizesFifo[buffSlot];
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
sub->npKitSizesFifo[buffSlot] = size;
|
||||
#endif
|
||||
|
||||
char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared;
|
||||
char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
int ready = 1;
|
||||
if (p == NCCL_PROTO_LL128) {
|
||||
ready = resources->useGdr;
|
||||
@@ -1074,7 +1089,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
if (resources->shared) {
|
||||
if (p == NCCL_PROTO_SIMPLE && resources->shared) {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
|
||||
|
||||
@@ -27,7 +27,6 @@
|
||||
#include "ibvwrap.h"
|
||||
#include "graph/xml.h"
|
||||
|
||||
#define USE_RDMA_WRITE 1
|
||||
#define MAXNAMESIZE 64
|
||||
static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
|
||||
static union ncclSocketAddress ncclIbIfAddr;
|
||||
@@ -75,7 +74,7 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int ncclIbRelaxedOrderingEnabled = 0;
|
||||
|
||||
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
|
||||
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
|
||||
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18);
|
||||
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
|
||||
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
|
||||
NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
|
||||
@@ -310,8 +309,8 @@ ncclResult_t ncclIbDmaBufSupport(int dev) {
|
||||
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
|
||||
// Test kernel DMA-BUF support with a dummy call (fd=-1)
|
||||
(void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
|
||||
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP if not supported (EBADF otherwise)
|
||||
dmaBufSupported = (errno != EOPNOTSUPP) ? 1 : 0;
|
||||
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
|
||||
dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
|
||||
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
|
||||
}
|
||||
if (dmaBufSupported == 0) return ncclSystemError;
|
||||
@@ -718,7 +717,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
||||
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
|
||||
stage->comm = rComm;
|
||||
stage->state = ncclIbCommStateAccept;
|
||||
lComm->sock.asyncFlag = 1;
|
||||
NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, lComm->sock.abortFlag, 1));
|
||||
|
||||
ib_accept:
|
||||
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
|
||||
|
||||
@@ -317,7 +317,6 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
|
||||
handle->nSocks = comm->nSocks;
|
||||
handle->nThreads = comm->nThreads;
|
||||
comm->sock.asyncFlag = 1;
|
||||
comm->dev = dev;
|
||||
*listenComm = comm;
|
||||
return ncclSuccess;
|
||||
@@ -394,7 +393,7 @@ ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
|
||||
for (; i<rComm->nSocks+1; i++) {
|
||||
uint8_t sendSockIdx;
|
||||
ncclCalloc(&sock, 1);
|
||||
NCCLCHECK(ncclSocketInit(sock, NULL, NULL, 1));
|
||||
NCCLCHECK(ncclSocketInit(sock, NULL, lComm->sock.abortFlag, 1));
|
||||
stage->sock = sock;
|
||||
stage->state = ncclSocketCommStateAccept;
|
||||
stage->iteration = i;
|
||||
|
||||
+35
-21
@@ -117,6 +117,14 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Check if NET would work better
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
if (useNet) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
int cudaDev2 = busIdToCudaDev(info2->busId);
|
||||
@@ -258,17 +266,17 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
|
||||
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
} else {
|
||||
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
|
||||
}
|
||||
} else {
|
||||
info->rank = intermediateRank;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
|
||||
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
|
||||
}
|
||||
|
||||
@@ -402,20 +410,24 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
||||
|
||||
ncclResult_t p2pSendFree(struct ncclConnector* send) {
|
||||
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
|
||||
if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));
|
||||
free(resources);
|
||||
if (resources) {
|
||||
if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));
|
||||
free(resources);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
|
||||
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
|
||||
if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
|
||||
if (resources) {
|
||||
if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
|
||||
}
|
||||
free(resources);
|
||||
}
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -492,14 +504,16 @@ static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection,
|
||||
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
if (useMemcpy) {
|
||||
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
|
||||
NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
|
||||
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
|
||||
CUDACHECK(hipFree(proxyInfo->ceDevBuff));
|
||||
CUDACHECK(hipStreamDestroy(proxyInfo->stream));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(hipEventDestroy(proxyInfo->events[i]));
|
||||
if (proxyInfo) {
|
||||
NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
|
||||
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
|
||||
CUDACHECK(hipFree(proxyInfo->ceDevBuff));
|
||||
CUDACHECK(hipStreamDestroy(proxyInfo->stream));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(hipEventDestroy(proxyInfo->events[i]));
|
||||
}
|
||||
free(proxyInfo);
|
||||
}
|
||||
free(proxyInfo);
|
||||
} else {
|
||||
// Do not check return code as CUDA may have already shut down
|
||||
hipFree(connection->transportResources);
|
||||
|
||||
+32
-18
@@ -49,6 +49,10 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct
|
||||
|
||||
if (ncclParamShmDisable() == 1) return ncclSuccess;
|
||||
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
if (useNet) return ncclSuccess;
|
||||
|
||||
// Same host?
|
||||
TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
@@ -191,17 +195,21 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
|
||||
static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
|
||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
||||
free(resources);
|
||||
if (resources) {
|
||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
||||
free(resources);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
||||
free(resources);
|
||||
if (resources) {
|
||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
||||
free(resources);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -243,25 +251,31 @@ static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection,
|
||||
|
||||
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
|
||||
CUDACHECK(hipStreamDestroy(resources->stream));
|
||||
CUDACHECK(hipFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(hipEventDestroy(resources->events[i]));
|
||||
|
||||
if (resources) {
|
||||
CUDACHECK(hipStreamDestroy(resources->stream));
|
||||
CUDACHECK(hipFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(hipEventDestroy(resources->events[i]));
|
||||
}
|
||||
free(connection->transportResources);
|
||||
}
|
||||
free(connection->transportResources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
|
||||
CUDACHECK(hipStreamDestroy(resources->stream));
|
||||
CUDACHECK(hipFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(hipEventDestroy(resources->events[i]));
|
||||
|
||||
if (resources) {
|
||||
CUDACHECK(hipStreamDestroy(resources->stream));
|
||||
CUDACHECK(hipFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(hipEventDestroy(resources->events[i]));
|
||||
}
|
||||
free(connection->transportResources);
|
||||
}
|
||||
free(connection->transportResources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -12,16 +12,17 @@
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
#define NCCL_MAJOR 2
|
||||
#define NCCL_MINOR 11
|
||||
#define NCCL_PATCH 4
|
||||
#define NCCL_MINOR 14
|
||||
#define NCCL_PATCH 3
|
||||
#define NCCL_SUFFIX ""
|
||||
|
||||
#define NCCL_VERSION_CODE 21104
|
||||
#define NCCL_VERSION_CODE 21403
|
||||
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
|
||||
|
||||
#define RCCL_BFLOAT16 1
|
||||
#define RCCL_GATHER_SCATTER 1
|
||||
#define RCCL_ALLTOALLV 1
|
||||
#define RCCL_MULTIRANKPERGPU 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -40,7 +41,29 @@ typedef enum { ncclSuccess = 0,
|
||||
ncclInternalError = 3,
|
||||
ncclInvalidArgument = 4,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclNumResults = 6 } ncclResult_t;
|
||||
ncclRemoteError = 6,
|
||||
ncclInProgress = 7,
|
||||
ncclNumResults = 8 } ncclResult_t;
|
||||
|
||||
/* Communicator configuration. Users can assign value to attributes to specify the
|
||||
* behavior of a communicator. */
|
||||
typedef struct ncclConfig_v21400 {
|
||||
/* attributes that users should never touch. */
|
||||
size_t size;
|
||||
unsigned int magic;
|
||||
unsigned int version;
|
||||
/* attributes that users are able to customize. */
|
||||
int blocking;
|
||||
} ncclConfig_t;
|
||||
|
||||
/* Config initializer must be assigned to initialize config structure when it is created.
|
||||
* Not initialized config will result in NCCL error. */
|
||||
#define NCCL_CONFIG_INITIALIZER { \
|
||||
sizeof(ncclConfig_t), /* size */ \
|
||||
0xcafebeef, /* magic */ \
|
||||
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
|
||||
1 /* blocking */ \
|
||||
}
|
||||
|
||||
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
*
|
||||
@@ -69,6 +92,13 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Create a new communicator (multi thread/process version) with a configuration
|
||||
* set by users. */
|
||||
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version).
|
||||
|
||||
@details
|
||||
@@ -87,6 +117,28 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
|
||||
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version) allowing multiple ranks per device.
|
||||
|
||||
@details
|
||||
rank must be between 0 and nranks-1 and unique within a communicator clique.
|
||||
Each rank is associated to a HIP device, which has to be set before calling
|
||||
ncclCommInitRankMulti.
|
||||
Since this version of the function allows multiple ranks to utilize the same
|
||||
HIP device, a unique virtualId per device has to be provided by each calling
|
||||
rank.
|
||||
ncclCommInitRankMulti implicitly synchronizes with other ranks, so it must be
|
||||
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
|
||||
|
||||
@param[in]
|
||||
comm ncclComm_t*
|
||||
communicator struct pointer
|
||||
*/
|
||||
ncclResult_t ncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
|
||||
/// @endcond
|
||||
|
||||
|
||||
/*! @brief Creates a clique of communicators (single process version).
|
||||
*
|
||||
* @details This is a convenience function to create a single-process communicator clique.
|
||||
@@ -100,23 +152,46 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object, but waits for any operations that might still be running on the device */
|
||||
/*! @brief Finalize a communicator.
|
||||
* @details ncclCommFinalize flushes all issued communications,
|
||||
* and marks communicator state as ncclInProgress. The state will change to ncclSuccess
|
||||
* when the communicator is globally quiescent and related resources are freed; then,
|
||||
* calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
|
||||
* itself) without blocking. */
|
||||
ncclResult_t ncclCommFinalize(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommFinalize(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees local resources associated with communicator object. */
|
||||
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommDestroy(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object and aborts any operations that might still be running on the device. */
|
||||
/*! @brief Frees resources associated with communicator object and aborts any operations
|
||||
* that might still be running on the device. */
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns a human-readable error message. */
|
||||
/*! @brief Returns a string for each error code. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
/// @cond include_hidden
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Checks whether the comm has encountered any asynchronous errors */
|
||||
/*! @brief Returns a human-readable message of the last error that occurred.
|
||||
* comm is currently unused and can be set to NULL
|
||||
*/
|
||||
const char* ncclGetLastError(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
const char* pncclGetError(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/* Checks whether the comm has encountered any asynchronous errors */
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
@@ -173,7 +248,7 @@ typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
ncclBfloat16 = 9,
|
||||
ncclNumTypes = 10 } ncclDataType_t;
|
||||
|
||||
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
|
||||
/*! @brief ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
|
||||
typedef enum {
|
||||
/* ncclScalarDevice: The scalar is in device-visible memory and will be
|
||||
* dereferenced while the collective is running. */
|
||||
@@ -184,9 +259,7 @@ typedef enum {
|
||||
ncclScalarHostImmediate = 1
|
||||
} ncclScalarResidence_t;
|
||||
|
||||
/*
|
||||
* ncclRedOpCreatePreMulSum
|
||||
*
|
||||
/*! @brief ncclRedOpCreatePreMulSum
|
||||
* Creates a new reduction operator which pre-multiplies input values by a given
|
||||
* scalar locally before reducing them with peer values via summation. For use
|
||||
* only with collectives launched against *comm* and *datatype*. The
|
||||
@@ -195,17 +268,19 @@ typedef enum {
|
||||
* is stored in *op*.
|
||||
*/
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* ncclRedOpDestroy
|
||||
*
|
||||
* Destroys the reduction operator *op*. The operator must have been created by
|
||||
/*! @brief ncclRedOpDestroy
|
||||
* @details Destroys the reduction operator *op*. The operator must have been created by
|
||||
* ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
|
||||
* destroyed as soon as the last NCCL function which is given that operator returns.
|
||||
*/
|
||||
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Collective communication operations
|
||||
@@ -345,11 +420,11 @@ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t dataty
|
||||
* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
|
||||
* ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Gather
|
||||
|
||||
@@ -13,8 +13,8 @@ struct ncclGraphInfo {
|
||||
int pattern;
|
||||
int nChannels;
|
||||
int sameChannels;
|
||||
float speedIntra;
|
||||
float speedInter;
|
||||
float bwIntra;
|
||||
float bwInter;
|
||||
int typeIntra;
|
||||
int typeInter;
|
||||
};
|
||||
@@ -28,6 +28,7 @@ struct allGather3Data_t{
|
||||
struct ncclGraphInfo collNet;
|
||||
struct ncclTopoRanks topoRanks;
|
||||
bool pivotA2AEnabled;
|
||||
bool ll128Enabled;
|
||||
};
|
||||
|
||||
void initCollNet();
|
||||
|
||||
@@ -75,14 +75,39 @@ int busIdToCudaDev(int64_t busId) {
|
||||
return node_model->busIdToCudaDev(busId);
|
||||
}
|
||||
|
||||
static int useMemcpy = 0;
|
||||
|
||||
/* Determine if two peers can communicate with P2P */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
// Rule out different nodes
|
||||
*ret = 0;
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
int cudaDev2 = busIdToCudaDev(info2->busId);
|
||||
*ret = node_model->p2pCanConnect(cudaDev1, cudaDev2);
|
||||
if (!info1->hasFineGrain || !info2->hasFineGrain) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Rule out different nodes / isolated containers
|
||||
if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Check topology / p2p level.
|
||||
int intermediateRank;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
|
||||
if (*ret == 0) return ncclSuccess;
|
||||
if (intermediateRank != -1) {
|
||||
if (useMemcpy) *ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Check if NET would work better
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
if (useNet) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -118,14 +143,26 @@ struct ncclTransport p2pTransport = {
|
||||
{ p2pRecvSetup, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
|
||||
|
||||
/* Determine if two peers can communicate with SHM */
|
||||
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
// Rule out different nodes
|
||||
*ret = 0;
|
||||
if (ncclParamShmDisable() == 1) return ncclSuccess;
|
||||
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
if (useNet) return ncclSuccess;
|
||||
|
||||
// Same host?
|
||||
TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
int cudaDev2 = busIdToCudaDev(info2->busId);
|
||||
*ret = node_model->shmCanConnect(cudaDev1, cudaDev2);
|
||||
|
||||
// Common /dev/shm (between containers) ?
|
||||
TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev);
|
||||
if (info1->shmDev != info2->shmDev) return ncclSuccess;
|
||||
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -161,7 +198,7 @@ struct setupReq {
|
||||
|
||||
/* Determine if two peers can communicate with NET */
|
||||
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = node_model->netCanConnect(info1->rank, info2->rank);
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -224,6 +224,7 @@ int main(int argc,char* argv[])
|
||||
comm[i].topo = node_model->getSystem(i);
|
||||
comm[i].peerInfo = peerInfo;
|
||||
comm[i].ncclNet = ncclNet;
|
||||
comm[i].virtualId = -1;
|
||||
// Mark channels as non initialized.
|
||||
for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
|
||||
NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
|
||||
|
||||
@@ -39,7 +39,6 @@ extern NodeModel *node_model;
|
||||
|
||||
NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
|
||||
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
|
||||
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
|
||||
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
ncclCollNet_t* ncclCollNet = NULL;
|
||||
@@ -462,7 +461,10 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
|
||||
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
|
||||
RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0);
|
||||
NCCL_PARAM(AllocP2pNetLLBuffers, "NCCL_ALLOC_P2P_NET_LL_BUFFERS", 0);
|
||||
RCCL_PARAM(LL128ForceEnable, "LL128_FORCE_ENABLE", 0);
|
||||
|
||||
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
|
||||
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
|
||||
@@ -484,13 +486,30 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
//NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, comm->rank));
|
||||
//NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)));
|
||||
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
|
||||
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
|
||||
return ncclInvalidUsage;
|
||||
//If virtualId == -1 multiRank support has not been requested by user, using original interface
|
||||
if (comm->virtualId == -1) {
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
|
||||
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
//Multiple ranks can use the same device, but need to have different virtualId's.
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
for (int j=0; j < nranks; j++) {
|
||||
if (j==i) continue;
|
||||
if((comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash) &&
|
||||
(comm->peerInfo[i].busId == comm->peerInfo[j].busId) &&
|
||||
(comm->peerInfo[i].virtualId == comm->peerInfo[j].virtualId)) {
|
||||
WARN("Duplicate virtualId detected : rank %d and rank %d both on GPU device %lx virtualId %d",
|
||||
i, j, comm->peerInfo[rank].busId, comm->peerInfo[i].virtualId);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// AllGather1 - end
|
||||
|
||||
// Topo detection / System graph creation
|
||||
@@ -502,6 +521,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
// init Pivot A2A related fields
|
||||
comm->topo->pivotA2AEnabled = false;
|
||||
comm->topo->pivotA2ANumBiRings = 0;
|
||||
// LL128
|
||||
comm->topo->ll128Enabled = false;
|
||||
// Compute paths between GPUs and NICs
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
|
||||
// Remove inaccessible GPUs and unused NICs
|
||||
@@ -573,6 +594,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
allXgmi &= isXGMI;
|
||||
}
|
||||
}
|
||||
// Initialize num P2P LL buffers for this communicator
|
||||
comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
|
||||
|
||||
if (comm->rank == ncclParamGraphDumpFileRank()) {
|
||||
struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
|
||||
@@ -605,8 +628,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
int pattern;
|
||||
int nChannels;
|
||||
int sameChannels;
|
||||
float speedIntra;
|
||||
float speedInter;
|
||||
float bwIntra;
|
||||
float bwInter;
|
||||
int typeIntra;
|
||||
int typeInter;
|
||||
};
|
||||
@@ -620,6 +643,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
struct ncclGraphInfo collNet;
|
||||
struct ncclTopoRanks topoRanks;
|
||||
bool pivotA2AEnabled;
|
||||
bool ll128Enabled;
|
||||
} *allGather3Data;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
|
||||
@@ -627,13 +651,19 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
int idx;
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
|
||||
allGather3Data[rank].nc = 2;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
|
||||
if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) ||
|
||||
(comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
|
||||
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
|
||||
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
|
||||
if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) ||
|
||||
(comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
|
||||
(comm->topo->type & RCCL_TOPO_CR8G))
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
|
||||
if (((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1) ||
|
||||
(comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
|
||||
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
|
||||
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
|
||||
@@ -643,26 +673,28 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
allGather3Data[rank].tree.pattern = treeGraph.pattern;
|
||||
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
|
||||
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
|
||||
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
|
||||
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
|
||||
allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra;
|
||||
allGather3Data[rank].tree.bwInter = treeGraph.bwInter;
|
||||
allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
|
||||
allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
|
||||
allGather3Data[rank].ring.pattern = ringGraph.pattern;
|
||||
allGather3Data[rank].ring.nChannels = ringGraph.nChannels;
|
||||
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
|
||||
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
|
||||
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
|
||||
allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra;
|
||||
allGather3Data[rank].ring.bwInter = ringGraph.bwInter;
|
||||
allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
|
||||
allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
|
||||
allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
|
||||
allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels;
|
||||
allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
|
||||
allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
|
||||
allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
|
||||
allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra;
|
||||
allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter;
|
||||
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
|
||||
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
|
||||
allGather3Data[rank].collNetSupport = comm->collNetSupport;
|
||||
allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
|
||||
comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable();
|
||||
allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled;
|
||||
|
||||
comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
|
||||
? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
|
||||
@@ -738,24 +770,25 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
// Make sure we align all ranks so that the tuning is consistent across ranks
|
||||
treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
|
||||
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
|
||||
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
|
||||
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
|
||||
treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra);
|
||||
treeGraph.bwInter = std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter);
|
||||
treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
|
||||
treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
|
||||
ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
|
||||
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
|
||||
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
|
||||
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
|
||||
ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra);
|
||||
ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter);
|
||||
ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
|
||||
ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
|
||||
collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
|
||||
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
|
||||
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
|
||||
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
|
||||
collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra);
|
||||
collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter);
|
||||
collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
|
||||
collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
|
||||
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
|
||||
comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled;
|
||||
comm->topo->ll128Enabled = comm->topo->ll128Enabled && allGather3Data[i].ll128Enabled;
|
||||
}
|
||||
|
||||
comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
|
||||
@@ -787,7 +820,10 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc));
|
||||
|
||||
if (comm->topo->pivotA2ANumBiRings == 3) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
|
||||
if (comm->topo->pivotA2ANumBiRings == 3) {
|
||||
NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
|
||||
NCCLCHECK(ncclBinaryTreePostset(comm, &treeGraph));
|
||||
}
|
||||
|
||||
free(allTopoRanks);
|
||||
free(nodesTreePatterns);
|
||||
@@ -798,16 +834,25 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
|
||||
|
||||
char line[1024];
|
||||
char line[1024], binline[1024];
|
||||
line[0]='\0';
|
||||
binline[0]='\0';
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclTree* tree = &comm->channels[c].tree;
|
||||
struct ncclTree* binTree = &comm->channels[c].binTree;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
|
||||
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
|
||||
if (comm->topo->pivotA2ANumBiRings == 3)
|
||||
snprintf(binline+strlen(binline), 1023-strlen(binline), " [%d] %d/%d/%d->%d->%d",
|
||||
c, binTree->down[0], binTree->down[1], binTree->down[2], rank, binTree->up);
|
||||
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
|
||||
}
|
||||
line[1023] = '\0';
|
||||
INFO(NCCL_INIT, "Trees%s", line);
|
||||
if (comm->topo->pivotA2ANumBiRings == 3) {
|
||||
binline[1023] = '\0';
|
||||
INFO(NCCL_INIT, "BinTrees%s", binline);
|
||||
}
|
||||
|
||||
//NCCLCHECK(computeBuffSizes(comm));
|
||||
|
||||
@@ -838,6 +883,11 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
|
||||
// RCCL: need to connect binTree as well
|
||||
if (comm->topo->pivotA2ANumBiRings == 3) {
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->binTree.down, 1, &channel->binTree.up, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->binTree.up, NCCL_MAX_TREE_ARITY, channel->binTree.down, 0), ret, affinity_restore);
|
||||
}
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
|
||||
INFO(NCCL_INIT, "Connected all trees");
|
||||
@@ -870,16 +920,38 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
|
||||
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
|
||||
|
||||
// Connect intra-node CollNet
|
||||
char line[1024];
|
||||
line[0]='\0';
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclTree* chain = &comm->channels[c].collnetChain;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d",
|
||||
c, chain->down[0], rank, chain->up);
|
||||
}
|
||||
line[1023] = '\0';
|
||||
INFO(NCCL_INIT, "Collnet Chains %s", line);
|
||||
// Connect Collnet + chain
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
|
||||
INFO(NCCL_INIT, "Connected collnet + chain");
|
||||
|
||||
// Connect intra-node CollNet + Direct
|
||||
int highestTransportType0, highestTransportType1;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
|
||||
|
||||
@@ -1057,6 +1129,8 @@ collnet_cleanup:
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
|
||||
|
||||
/* Local intra-node barrier */
|
||||
//NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user