Merge remote-tracking branch 'nccl/master' into develop

Этот коммит содержится в:
Wenkai Du
2021-04-30 16:57:36 -07:00
родитель 4f8e788a61 a46ea10583
Коммит a4ea1fed5b
80 изменённых файлов: 3136 добавлений и 1675 удалений
+2 -2
Просмотреть файл
@@ -203,8 +203,8 @@ if("${HIP_COMPILER}" MATCHES "clang")
find_program( hipcc_executable hipcc )
execute_process(COMMAND bash "-c" "${hipcc_executable} -help | grep 'parallel-jobs'" OUTPUT_VARIABLE hipcc_parallel_jobs)
if("${hipcc_parallel_jobs}" MATCHES "parallel-jobs")
target_compile_options(rccl PRIVATE -parallel-jobs=4 PRIVATE -Wno-format-nonliteral)
target_link_libraries(rccl PRIVATE -parallel-jobs=4)
target_compile_options(rccl PRIVATE -parallel-jobs=8 PRIVATE -Wno-format-nonliteral)
target_link_libraries(rccl PRIVATE -parallel-jobs=8)
endif()
# RCCL static lib uses -fgpu-rdc which requires hipcc as the linker and archiver
+2 -2
Просмотреть файл
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 8
NCCL_PATCH := 4
NCCL_MINOR := 9
NCCL_PATCH := 6
NCCL_SUFFIX :=
PKG_REVISION := 1
+5 -5
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -56,8 +56,8 @@ ALWAYS_REBUILD:
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
$(INCDIR)/nccl.h : nccl.h.in
# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
# NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
mkdir -p $(INCDIR)
@printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
@@ -129,7 +129,7 @@ install : lib
cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.
+14 -10
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -222,6 +222,7 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
// A connection that bootstrapRecv accepted while it was waiting for a
// different (peer, tag) pair. Entries are chained into a singly linked list
// whose head is state->unexpectedConnections, and are later claimed by
// unexpectedDequeue when a receive for the same (peer, tag) is issued.
struct unexConn {
int peer; // sender rank read from the connection
int tag; // message tag read from the connection
int fd; // socket for this connection; its payload has not been read yet
struct unexConn* next; // next queued connection
};
@@ -445,21 +446,23 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
return ncclSuccess;
}
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
// Send `size` bytes from `data` to rank `peer` over a temporary connection.
//
// The message is framed as: sender rank (int), tag (int), then the payload.
// bootstrapRecv on the peer reads the rank and tag first to decide whether the
// connection matches the receive it is waiting on.
//
//   commState - opaque bootstrap state (a struct extState*)
//   peer      - destination rank; indexes state->peerCommAddresses
//   tag       - caller-chosen tag used to match the message on the receiver
//   data/size - payload buffer and its length in bytes
//
// Returns ncclSuccess, or the first error reported while connecting/sending.
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
  struct extState* state = (struct extState*)commState;
  int tmpSendFd;
  NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer));

  // From here on the socket is open. Results are checked by hand instead of
  // with NCCLCHECK so that a failed send cannot leave this function without
  // closing the descriptor.
  ncclResult_t res = bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int));
  if (res == ncclSuccess) res = bootstrapNetSend(tmpSendFd, &tag, sizeof(int));
  if (res == ncclSuccess) res = bootstrapNetSend(tmpSendFd, data, size);

  close(tmpSendFd);
  return res;
}
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd) {
// New unex
struct unexConn* unex;
NCCLCHECK(ncclCalloc(&unex, 1));
unex->peer = peer;
unex->tag = tag;
unex->fd = fd;
// Enqueue
@@ -473,11 +476,11 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
return ncclSuccess;
}
int unexpectedDequeue(struct extState* state, int peer) {
int unexpectedDequeue(struct extState* state, int peer, int tag) {
struct unexConn* elem = state->unexpectedConnections;
struct unexConn* prev = NULL;
while (elem) {
if (elem->peer == peer) {
if (elem->peer == peer && elem->tag == tag) {
if (prev == NULL) {
state->unexpectedConnections = elem->next;
} else {
@@ -494,13 +497,13 @@ int unexpectedDequeue(struct extState* state, int peer) {
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
struct extState* state = (struct extState*)commState;
int tmpRecvFd;
// Search unexpected connections first
if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) {
if ((tmpRecvFd = unexpectedDequeue(state, peer, tag)) != -1) {
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
close(tmpRecvFd);
return ncclSuccess;
@@ -509,15 +512,16 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
// Then look for new connections
while (1) {
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd));
int newPeer;
int newPeer, newTag;
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int)));
if (newPeer == peer) {
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newTag, sizeof(int)));
if (newPeer == peer && newTag == tag) {
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
close(tmpRecvFd);
return ncclSuccess;
}
// Unexpected connection. Save for later.
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd));
NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, tmpRecvFd));
}
}
+31 -14
Просмотреть файл
@@ -1,12 +1,16 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "channel.h"
#include "param.h"
#include "gdrwrap.h"
// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory
NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
@@ -21,14 +25,25 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
for (size_t i=0; i<comm->nRanks+1; ++i) {
channel->peers[i].send.comm = comm;
channel->peers[i].recv.comm = comm;
channel->peers[i].p2pSend.comm = comm;
channel->peers[i].p2pRecv.comm = comm;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
channel->peers[i].send[b].comm = comm;
channel->peers[i].recv[b].comm = comm;
}
}
// Per-channel operation list.
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
// GDRCOPY support
// We allocate a workFifo in GDR mapped CUDA memory
// But we still allocate the Host workFifo so that we
// can copy the work elements to CUDA memory on kernel launch
NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc));
} else {
// The device workFifo is the Host one
channel->workFifoDev = channel->workFifo;
}
return ncclSuccess;
}
@@ -36,6 +51,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (channel->id == -1) return ncclSuccess;
// Operation list
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
if (channel->gdrMemDesc) {
// GDRCOPY support
NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc));
}
// Free Ring index to rank tables
free(channel->ring.userRanks);
@@ -45,17 +64,15 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
// Note: free all send resources first due to CollNet arrangement
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
if (peer->send.transportResources == peer->p2pSend.transportResources) peer->p2pSend.transportResources = NULL;
peer->send.transportResources = NULL;
if (peer->p2pSend.transportResources) NCCLCHECK(peer->p2pSend.transportComm->free(peer->p2pSend.transportResources));
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->send[b].transportResources) NCCLCHECK(peer->send[b].transportComm->free(peer->send[b].transportResources));
}
}
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
if (peer->recv.transportResources == peer->p2pRecv.transportResources) peer->p2pRecv.transportResources = NULL;
peer->recv.transportResources = NULL;
if (peer->p2pRecv.transportResources) NCCLCHECK(peer->p2pRecv.transportComm->free(peer->p2pRecv.transportResources));
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->recv[b].transportResources) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv[b].transportResources));
}
}
// Free the peer structures.
+1 -1
Просмотреть файл
@@ -350,7 +350,7 @@ ncclResult_t CliqueManager::SetCliqueArgs(ncclWorkElem* args)
}
// Prepare clique arguments (NOTE: clique pointers are not ready yet)
int opIndex = args->opCount % NCCL_MAX_OPS;
int opIndex = args->op.opCount % NCCL_MAX_OPS;
args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
return ncclSuccess;
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+52 -86
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -169,7 +169,7 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, FUNC, T
}
#else
int nthreadsSplit = nthreads/2;
if (nthreadsSplit == 256) nthreadsSplit += 64;
if (nthreadsSplit >= 256) nthreadsSplit += 64;
if (tree->up == -1) {
if (tid < nthreads) {
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
@@ -218,59 +218,78 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, FUNC, T
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
#define COLLNET_COPY_THREADS 64
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
//const int nthreads = args->nThreads-3*WARP_SIZE;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->collTree;
struct ncclDirect* tree = &channel->collTree;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
int chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (blockIdx.x < nChannels) { // first half of the channels do reduce
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
const int hasDn = (tree->down[0] >= 0) ? 1 : 0;
const int nThreadsScatter = (hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0;
const int nThreadsGather = (hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0;
const int nThreadsBcast = (hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS;
// Gather does not need sync threads, sparing one more warp for reduce
const int nThreadsReduce = NCCL_SIMPLE_MAX_NTHREADS - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int tidStartBcast = nThreadsGather;
const int tidStartScatter = tidStartBcast + nThreadsBcast;
const int tidStartReduce = tidStartScatter + nThreadsScatter;
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
ncclPrimitives<UNROLL, 1, 1, T, 0, NCCL_MAX_DIRECT_ARITY, 0, FUNC>
prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 2);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
int nelem = min(tree->nHeads*chunkSize, size-offset);
prims.scatter(thisInput+offset, nelem, chunkSize, tree->headRank, tree->shift);
}
} else if (tid >= tidStartReduce && tree->out != -1) {
// Reduce, send to network
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DIRECT_ARITY, 1, 0, FUNC>
prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, NULL, stepSize, channel, comm, ncclShmem->ptrs, 3);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.send(thisInput+offset, nelem);
} else {
if (hasDn) {
prims.recvReduceSend(thisInput+offset, nelem);
} else {
prims.send(thisInput+offset, nelem);
}
}
}
if (blockIdx.x >= nChannels && blockIdx.x < 2*nChannels) { // second half of the channels do broadcast
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
} else if (tid < tidStartBcast && hasUp) {
// Gather
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DIRECT_ARITY, 0, 0, FUNC>
prims(tid, nThreadsGather, tree->up, NULL, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
int nelem = min(tree->nHeads*chunkSize, size-offset);
prims.gather(thisOutput+offset, nelem, chunkSize, tree->headRank, tree->shift);
}
} else if (tid >= tidStartBcast && tid < tidStartScatter && tree->out != -1) {
// Recv from network, broadcast
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DIRECT_ARITY, 0, FUNC>
prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.recv(thisOutput+offset, nelem);
} else {
if (hasDn) {
prims.recvCopySend(thisOutput+offset, nelem);
} else {
prims.recv(thisOutput+offset, nelem);
}
}
}
@@ -417,60 +436,7 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL, FUNC, T, UN
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->collTree;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (blockIdx.x < nChannels) { // first half of the channels do reduce
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
if (blockIdx.x >= nChannels && blockIdx.x < 2*nChannels) { // second half of the channels do broadcast
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}
__device__ void run(struct ncclWorkElem* args) { }
};
#include "prims_ll128.h"
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+9 -10
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -181,9 +181,8 @@ static __device__ void load_parallel(void* dst, void* src, size_t size, int tid)
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
}
static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
__syncthreads();
load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork *hostWork, struct ncclWork* workFifo, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
load_parallel(localWork, workFifo, sizeof(struct ncclWork), tid);
// Check whether the last operation was aborted and make sure all threads exit
int abort = tid == 0 ? LOAD(comm->abortFlag) : 0;
exitIfAbortBarrier(abort, abortCount);
@@ -201,7 +200,7 @@ class ncclFunction {
#define traceColl(fIdx) \
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
comm->collTrace[pos].opCount = w->opCount; \
comm->collTrace[pos].opCount = w->op.opCount; \
comm->collTrace[pos].bid = bid; \
comm->collTrace[pos].funcIndex = fIdx; \
if (fIdx == FUNC_INDEX_P2P) { \
@@ -246,8 +245,8 @@ class ncclFunction {
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclShmemPtrs {
void* srcs[NCCL_MAX_DEV_ARITY+1];
void* dsts[NCCL_MAX_DEV_ARITY+1];
void* srcs[NCCL_MAX_DIRECT_ARITY+1];
void* dsts[NCCL_MAX_DIRECT_ARITY+1];
uint64_t barrier;
uint64_t barrier_next[MAXWARPS];
};
@@ -287,7 +286,6 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
struct ncclDevComm* comm = first.comm;
struct ncclChannel* channel = comm->channels+bid;
struct ncclWorkElem* w = NULL;
uint16_t index = first.index;
bool firstLaunch = true;
if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
@@ -295,7 +293,8 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
while (1) {
if (w == NULL) {
w = shmem.localWork.elems;
if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) {
__syncthreads();
if (!load_coll(&shmem.localWork, channel->workFifo+channel->index, channel->workFifoDev+channel->index, tid, comm, &abortCount)) {
if (COLLTRACE && tid == 0) traceAbort(0xffff);
return;
}
@@ -315,7 +314,7 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
NCCL_CALL_FUNCTIONS(w);
}
}
index = (index+1) % NCCL_MAX_OPS;
if (tid == 0) channel->index = (channel->index+1) % NCCL_MAX_OPS;
if (w->active == 2) {
if (COLLTRACE && tid == 0) traceCollEnd(0xffff);
return;
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+77 -14
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -71,11 +71,11 @@ class ncclPrimitives {
int peer = -1;
int role = 0;
int group;
const int p2p;
uint64_t step;
T* direct = NULL;
T* buff;
struct ncclDevComm* comm;
const int p2pNet;
const T** srcs;
T** dsts;
@@ -130,7 +130,7 @@ class ncclPrimitives {
STORE(connSizesFifoPtr+step%NCCL_STEPS, nbytes);
}
if (connPtrsFifoPtr) dsts[DST+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
if (connPtrsFifoPtr) dsts[DST+index] = (T *)LOAD(connPtrsFifoPtr+step%NCCL_STEPS);
else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
step += SLICESTEPS;
}
@@ -148,7 +148,7 @@ class ncclPrimitives {
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
#endif
if (connPtrsFifoPtr) srcs[SRC+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
if (connPtrsFifoPtr) srcs[SRC+index] = (const T *)LOAD(connPtrsFifoPtr+step%NCCL_STEPS);
else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
step += SLICESTEPS;
}
@@ -197,9 +197,58 @@ class ncclPrimitives {
}
}
// Scatter and gather do not support DIRECT
//
// Shared implementation behind scatter() (RECV=0, SEND=1) and gather()
// (RECV=1, SEND=0). The user buffer is treated as consecutive regions of
// peerElem elements, one per peer connection; the work is done in
// SLICESPERCHUNK slices, and in each slice every peer moves at most realSize
// elements of its own region.
//
//   srcPtr    - base of the source buffer (read on the SEND path only)
//   dstPtr    - base of the destination buffer (written on the RECV path only)
//   totalElem - number of valid elements from the base pointer; caps each copy
//   peerElem  - elements per peer region (stride between regions)
//   skip      - when >= 0, peers with index >= skip use the following region,
//               so region number `skip` of the buffer is never touched
//   shift     - rotation applied to the order in which peers are visited
template <int RECV, int SEND>
inline __device__ void
ScatterGatherOp(const T* srcPtr, T* dstPtr, int totalElem, int peerElem, int skip, int shift) {
int offset = 0; // slice offset
int sliceSize = stepSize*SLICESTEPS;
// peerElem divided across the slices in multiples of 16 elements, but never
// less than sliceSize/32 (DIVUP is presumably a round-up division - confirm).
int dataSize = max(DIVUP(peerElem, 16*SLICESPERCHUNK)*16, sliceSize/32); // per-peer slice size
#pragma unroll
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
// Elements left in the region for this slice, clamped to [0, dataSize].
int realSize = max(0, min(dataSize, peerElem-offset));
// Only the worker threads wait on the connections and copy data.
if (tid < nworkers) {
if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<0, 0>(0);
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
if (SEND && (role & ROLE_WAIT_SEND)) waitSend<0, 0>(0, realSize*sizeof(T));
subBarrier();
if (SEND) {
// Scatter: copy each peer's region of srcPtr into that peer's dsts[i].
#pragma unroll 1
for (int j=0; j<nsend; j++) {
// Start at a rotated index instead of always visiting peer 0 first.
int i = (j+shift)%nsend;
int peerOffset = i*peerElem + offset;
if (skip >=0 && i >= skip) peerOffset += peerElem;
const T* src0 = srcPtr + peerOffset;
// Do not read past totalElem; a non-positive size means nothing to copy.
int realPeerSize = min(realSize, totalElem-peerOffset);
if (realPeerSize > 0) ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nworkers, 1, &src0, 1, dsts+i, realPeerSize);
}
} else if (RECV) {
// Gather: copy from each peer's srcs[i] into that peer's region of dstPtr.
#pragma unroll 1
for (int j=0; j<nrecv; j++) {
int i = (j+shift)%nrecv;
int peerOffset = i*peerElem + offset;
if (skip >= 0 && i >= skip) peerOffset += peerElem;
T* dst0 = dstPtr + peerOffset;
// Do not write past totalElem; a non-positive size means nothing to copy.
int realPeerSize = min(realSize, totalElem-peerOffset);
if (realPeerSize > 0) ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nworkers, 1, srcs+i, 1, &dst0, realPeerSize);
}
}
}
barrier();
// NOTE(review): the fence presumably makes the copied data visible to the
// peer before postSend() advertises the step - confirm.
if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
__syncwarp();
if (SEND && (role & ROLE_POST_SEND)) postSend();
if (RECV && (role & ROLE_POST_RECV)) postRecv();
// Advance within the peer region for the next slice.
offset += realSize;
}
}
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
conn = (LOAD(comm->p2pNet) && p2p) ? &channel->devPeers[peer].p2pRecv.conn : &channel->devPeers[peer].recv.conn;
// For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 : (((p2pNet && (NSEND+NRECV) == 1)) ? NCCL_CONN_IDX_P2P_NET : 0);
conn = &channel->devPeers[peer].recv[connIndex].conn;
step = conn->step;
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
if (role & ROLE_POST_RECV) {
@@ -222,7 +271,9 @@ class ncclPrimitives {
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
conn = (LOAD(comm->p2pNet) && p2p) ? &channel->devPeers[peer].p2pSend.conn : &channel->devPeers[peer].send.conn;
// For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 : (((p2pNet && (NSEND+NRECV) == 1)) ? NCCL_CONN_IDX_P2P_NET : 0);
conn = &channel->devPeers[peer].send[connIndex].conn;
step = conn->step;
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
if (role & ROLE_POST_SEND) {
@@ -230,11 +281,13 @@ class ncclPrimitives {
}
if (role & ROLE_WAIT_SEND) {
buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
//if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
// void* volatile* ptr = conn->ptrExchange;
// while ((direct = (T*)(*ptr)) == NULL);
// *ptr = NULL;
//}
#if 0
if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
void* volatile* ptr = conn->ptrExchange;
while ((direct = (T*)(*ptr)) == NULL) { if (checkAbort()) break; }
*ptr = NULL;
}
#endif
connHeadPtr = conn->head;
connHeadCache = LOAD(connHeadPtr);
connSizesFifoPtr = conn->sizesFifo;
@@ -252,8 +305,8 @@ class ncclPrimitives {
public:
__device__ __forceinline__
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group, int p2p = 0)
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2p(p2p) {
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2pNet(*comm->p2pNet) {
nthreads = nworkers;
// For send operations, we need an extra warp to overlap the threadfence and the copy
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
@@ -347,6 +400,16 @@ class ncclPrimitives {
GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
}
// Scatter entry point: forwards to ScatterGatherOp<0, 1>, passing `src` as the
// source buffer and NULL for the destination. totalElem, peerElem, skip and
// shift are handed through unchanged.
// NOTE(review): the template arguments presumably mean <RECV=0, SEND=1> —
// confirm against the ScatterGatherOp declaration.
__device__ __forceinline__ void
scatter(const T* src, int totalElem, int peerElem, int skip, int shift) {
ScatterGatherOp<0, 1>(src, NULL, totalElem, peerElem, skip, shift);
}
// Gather entry point: forwards to ScatterGatherOp<1, 0>, passing NULL for the
// source and `dst` as the destination buffer. totalElem, peerElem, skip and
// shift are handed through unchanged.
// NOTE(review): the template arguments presumably mean <RECV=1, SEND=0> —
// confirm against the ScatterGatherOp declaration.
__device__ __forceinline__ void
gather(T* dst, int totalElem, int peerElem, int skip, int shift) {
ScatterGatherOp<1, 0>(NULL, dst, totalElem, peerElem, skip, shift);
}
__device__ __forceinline__ ~ncclPrimitives() {
// Save steps for the next operation
saveSync();
+5 -4
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -228,8 +228,9 @@ class ncclLLPrimitives {
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv->conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send->conn, i);
loadRecvSync();
loadSendSync();
}
+4 -4
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -394,8 +394,8 @@ class ncclLL128Primitives {
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv->conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send->conn, i);
loadRecvSync();
loadSendSync();
}
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+6 -4
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -49,14 +50,14 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
struct ncclChannel* channel = comm->channels+blockIdx.x;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
int nThreadsSplit = nThreads/2;
if ((tid < nThreadsSplit) && recvCount >= 0) {
const int chunkSize = args->p2p.recvChunkSize/sizeof(T);
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
int nt = nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv, 1);
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
if (recvCount == 0) {
prims.recv(recvbuff, 0);
@@ -68,10 +69,11 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
}
}
if ((tid >= nThreadsSplit) && sendCount >= 0) {
const int chunkSize = args->p2p.sendChunkSize/sizeof(T);
int peer = (comm->rank+delta)%comm->nRanks;
int nt = nThreads-nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend, 1);
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
if (sendCount == 0) {
prims.send(sendbuff, 0);
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+371 -128
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -11,6 +11,7 @@
#include "graph/topo.h"
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include "gdrwrap.h"
// Only generate inline kernels for LL
#define NCCL_FUNC5(func, algo, redop, dtype) \
@@ -65,6 +66,21 @@ static ncclKern_t const ncclKerns[1] = {
NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
};
// Determine the maximum kernel stack size of all CUDA kernels
size_t ncclKernMaxLocalSize() {
ncclResult_t res = ncclSuccess;
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
hipFuncAttributes attr = {0};
size_t max = 0;
for (int i = 0; i < numNcclKerns; i++) {
CUDACHECKGOTO(hipFuncGetAttributes(&attr, (const void*)(ncclKerns[i])), res, error);
if (attr.localSizeBytes > max) max = attr.localSizeBytes;
}
error:
return (res != ncclSuccess) ? 0 : max;
}
/*****************************************************************************/
/* Launch system : synchronization and CUDA kernel launch */
/*****************************************************************************/
@@ -108,14 +124,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
return ncclSuccess;
}
static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) {
ncclComm_t comm = eqInfo->comm;
hipLaunchParams* params = comm->myParams;
// Only launch blocks where we have work to do.
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
if (comm->channels[c].workCount) params->gridDim.x = c+1;
// This is not supported when we are in cudaGraph mode.
// Because in cudaGraph mode the launch param needs to be determined
// at capture time instead of launch time.
if (!usingCudaGraph) {
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
if (comm->channels[c].workCount) params->gridDim.x = c+1;
}
eqInfo->maxChannels = params->gridDim.x;
}
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<params->gridDim.x; c++) {
for (int c=0; c<eqInfo->maxChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->workCount == 0) {
struct ncclWork* w;
@@ -126,22 +151,39 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params)
e->p2p.nThreads = 0;
}
STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
{ // [RCCL] Wait for any clique-based collectives
NCCLCHECK(comm->cliqueManager->WaitForPointers());
} // [/RCCL]
if (c == 0) {
// Find the first operation, choose the kernel accordingly and pass it as the first argument.
// Note that changing cuda launch argument after capture is not supported by cudaGraph
struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS);
struct ncclWorkElem* elem = work->elems;
if (!usingCudaGraph) {
params->func = (void *)ncclKerns[0];
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
}
// As we inline the first coll directly, we can free it immediately.
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
}
if (channel->gdrMemDesc) {
// GDRCOPY support
uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
uint64_t nelems = channel->workCount;
TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi",
channel->workFifo, channel->workFifoGdr, first, nelems);
for (int i = 0; i < nelems; i++) {
int elem = (first+i) % NCCL_MAX_OPS;
// Copy Host workFifo to CUDA workFifo via the GDRCOPY mapping
NCCLCHECK(ncclGdrCudaCopy(channel->gdrMemDesc, channel->workFifoGdr+elem, channel->workFifo+elem, 1));
}
}
}
{ // [RCCL] Wait for any clique-based collectives
NCCLCHECK(comm->cliqueManager->WaitForPointers());
} // [/RCCL]
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclChannel* c0 = comm->channels;
struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
struct ncclWorkElem* elem = work->elems;
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
// As we inline the first coll directly, we can free it immediately.
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
params->func = (void *)ncclKerns[0];
return ncclSuccess;
}
@@ -184,21 +226,23 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
hipLaunchParams* params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
NCCLCHECK(setupLaunch(comm, params));
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
if (comm->launchMode == ncclComm::GROUP &&
(comm->groupCudaStream ||
comm->userStream == hipStreamDefault/* ||
comm->userStream == hipStreamLegacy ||
comm->userStream == hipStreamPerThread*/)) {
// Enqueue event in user stream
CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
CUDACHECK(hipEventRecord(comm->intDoneEvent, comm->userStream));
// Create dependency between user stream and internal NCCL stream
CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->intDoneEvent, 0));
params->stream = comm->groupStream;
} else {
if (comm->userStream != params->stream) {
if (comm->userStream != params->stream && !comm->usingCudaGraph) {
// Stream changed from last call, create dependency against last NCCL kernel launch
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
}
@@ -217,7 +261,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
hipLaunchParams *params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
@@ -230,51 +274,80 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
}
hipEvent_t startEvent;
hipEvent_t stopEvent;
if (comm->launchMode == ncclComm::PARALLEL) {
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
} else {
if (comm->launchMode == ncclComm::GROUP) {
NCCLCHECK(ncclCpuBarrierOut(comm));
} else {
CUDACHECK(hipLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
}
return ncclSuccess;
}
static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
// Start the network proxies as soon as the kernel has been launched. We can't
// perform any CUDA call between the two: a cudaFree between the CUDA launch
// and the ncclProxyStart call could cause a deadlock.
// Also, starting the proxies after the CUDA launch seems to be better for
// performance (latency).
uint64_t max = 0ULL;
for (int r=0; r<params->gridDim.x; r++) {
ncclComm_t comm = eqInfo->comm;
if (eqInfo->maxChannels == 0) return ncclSuccess;
for (int r=0; r<eqInfo->maxChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
max = std::max(max, channel->workFifoTail);
channel->workCount = 0;
}
for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
struct ncclChannel* channel = comm->channels+r;
channel->workFifoTail = max;
}
params->gridDim.x = params->blockDim.x = 0;
comm->lastOpCount = max;
comm->lastChannel = 0;
NCCLCHECK(ncclProxyStart(comm));
return ncclSuccess;
}
ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
ncclResult_t ncclRecordEvents(ncclComm_t comm) {
hipLaunchParams *params = comm->myParams;
// Enqueue event after NCCL kernel
CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
// Enqueue event after NCCL kernel (only in non-graph mode)
if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
if (comm->launchMode == ncclComm::GROUP &&
(comm->groupCudaStream ||
comm->userStream == hipStreamDefault/* ||
comm->userStream == hipStreamLegacy ||
comm->userStream == hipStreamPerThread*/)) {
CUDACHECK(hipEventRecord(comm->intDoneEvent, params->stream));
// Create dependency between NCCL internal stream and user stream
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->intDoneEvent, 0));
}
return ncclSuccess;
}
// Reset the communicator's per-launch state: user-stream flag, enqueue info,
// launch parameters, launch mode and the CUDA graph flag.
ncclResult_t ncclLaunchReset(ncclComm_t comm) {
  comm->userStreamSet = false;

  if (!comm->usingCudaGraph) {
    // Outside CUDA graph mode the same enqueue info space is reused.
    NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo));
  } else {
    // Capture of the current launch is finishing, but its enqueue info has to
    // be kept for the CUDA graph, so a new enqueue info is created for the
    // next run.
    NCCLCHECK(ncclCalloc(&comm->enqueueInfo, 1));
    comm->enqueueInfo->comm = comm;
  }

  // Clear the launch parameters.
  hipLaunchParams* launch = comm->myParams;
  launch->blockDim.x = 0;
  launch->gridDim.x = 0;
  launch->func = NULL;

  // Reset launch mode to GROUP if it was changed to GROUP_GRAPH.
  if (comm->launchMode == ncclComm::GROUP_GRAPH) {
    comm->launchMode = ncclComm::GROUP;
  }
  comm->usingCudaGraph = 0;

  return ncclSuccess;
}
/*****************************************************************************/
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/
RCCL_PARAM(SharpThreshold, "SHARP_THRESHOLD", 16384);
static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
@@ -283,14 +356,13 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
info->algorithm = -1;
info->protocol = -1;
int nAlgos = NCCL_NUM_ALGORITHMS;
#define SHARP_COLL_SAT_THRESHOLD 16384
// Check collNet support
int collNetTypeSupport = 0;
if (info->comm->collNetSupport && info->nBytes < SHARP_COLL_SAT_THRESHOLD*comm->collNetnChannels/2)
if (info->comm->collNetSupport > 0 && info->nBytes < rcclParamSharpThreshold())
NCCLCHECK(collNetReduceSupport(info->datatype, info->op, &collNetTypeSupport));
if (collNetTypeSupport != 1) nAlgos--;
for (int a=0; a<nAlgos; a++) {
if (a == NCCL_ALGO_COLLNET && collNetTypeSupport != 1) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time;
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, &time));
@@ -308,23 +380,37 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
int nc = (info->nChannels > 0) ? info->nChannels :
(info->algorithm == NCCL_ALGO_COLLNET) ? comm->collNetnChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
int nc = (info->nChannels > 0) ? info->nChannels : comm->nChannels;
int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
while (info->nBytes < nc*nt*threadThreshold) {
if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
if (info->algorithm == NCCL_ALGO_COLLNET) {
int ncSwitch = 16;
bool flag = true;
while (ncSwitch >= 1 && flag) {
while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collTree.nHeads*threadThreshold) && nc > ncSwitch) {
if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2;
nc--;
}
ncSwitch /= 2;
}
} else {
while (info->nBytes < nc*nt*threadThreshold) {
if (nc >= 2) nc--;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// do not reduce threads count on VEGA
// do not reduce threads count on VEGA
#else
else if ((nt % 128) == 0) nt/=2;
else if ((nt % 128) == 0) nt/=2;
#endif
else break;
else break;
}
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
if (info->protocol == NCCL_PROTO_SIMPLE) {
nt += WARP_SIZE; // Extra warp for sync
if (info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE;
}
#endif
info->nChannels = nc;
info->nThreads = nt;
@@ -341,7 +427,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
case ncclFuncAllGather:
info->pattern = ncclPatternRing; break;
case ncclFuncAllReduce:
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
default:
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
return ncclInternalError;
@@ -356,9 +442,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternTreeUpDown:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternCollTreeUp:
case ncclPatternCollTreeDown:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternCollTreeUpDown:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collTree.nHeads; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternRingTwice:
@@ -378,7 +464,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
work->opCount = info->comm->opCount;
work->op.opCount = info->comm->collOpCount;
work->sendbuff = info->sendbuff;
work->recvbuff = info->recvbuff;
work->coll.root = info->root;
@@ -432,9 +518,10 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*32 && chunkSize > 262144) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth/2 && chunkSize > 16384) chunkSize /= 2;
// Use lastChunkSize as chunkSize
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
@@ -459,20 +546,23 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->sliceSteps = sliceSteps;
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->chunkSize = chunkSize;
proxyArgs->protocol = info->protocol;
proxyArgs->dtype = info->datatype;
proxyArgs->redOp = info->op;
proxyArgs->redOp = (info->algorithm == NCCL_ALGO_COLLNET) ? info->op : ncclNumOps; // Only set redOp when using CollNet
proxyArgs->pattern = info->pattern;
proxyArgs->root = info->root;
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
// because some protocols need to transmit more than the total size, plus they sometimes
// round up
proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p",
proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm);
return ncclSuccess;
}
@@ -487,72 +577,100 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
return ncclSuccess;
}
ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
if (info->comm->nRanks == 1) {
// Compute enqueue element, save it in list
// Compute CUDA launch parameters
// Capture time code in view of CUDA graph
static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
ncclComm_t comm = info->comm;
if (comm->nRanks == 1) {
if (info->sendbuff != info->recvbuff)
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
return ncclSuccess;
}
struct ncclWorkElem work;
struct ncclProxyArgs proxyArgs;
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &work, &proxyArgs));
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem));
struct ncclWorkElem* work = &eqElem->work;
eqElem->proxyArgs.nsubs = 1;
NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs));
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
// Determine grid size
hipLaunchParams* params = comm->myParams;
params->gridDim.x += info->nChannels;
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
params->blockDim.x = std::max<unsigned>(params->blockDim.x, info->nThreads);
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
int nChannels = work.coll.nChannels;
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
// Inline the first kernel
if (params->func == NULL) {
params->func = (void *)ncclKerns[0];
memcpy(&comm->args, work, sizeof(struct ncclWorkElem));
comm->args.coll.bid = 0; // Only inline for channel 0
comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode
}
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
struct ncclChannel* channel = info->comm->channels+channelId;
return ncclSuccess;
}
// Dynamic enqueue code
static ncclResult_t ncclEnqueueCollKernel(ncclComm_t comm, struct ncclQueueElem* eqElem) {
struct ncclWorkElem* work = &eqElem->work;
struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
int nChannels = work->coll.nChannels;
for (int bid=0; bid<nChannels; bid++) {
int channelId = comm->lastChannel % comm->nChannels;
struct ncclChannel* channel = comm->channels+channelId;
// Proxy
proxyArgs.channel = channel;
// Adjust pattern for CollNet based on channel index
if (nSubChannels == 2) {
info->pattern = (channelId < info->comm->collNetnChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
}
proxyArgs->subs[0].channel = channel;
proxyArgs->opCount = comm->collOpCount;
proxyArgs->commOpCount = comm->opCount;
if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
info->comm->myParams->gridDim.x++;
if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks));
comm->lastChannel++;
// [RCCL] Setup pointers to where all the input/output pointers will be
if (info->protocol == NCCL_PROTO_CLIQUE) {
NCCLCHECK(info->comm->cliqueManager->SetCliqueArgs(&work));
if (proxyArgs->protocol == NCCL_PROTO_CLIQUE) {
NCCLCHECK(comm->cliqueManager->SetCliqueArgs(work));
}
// [/RCCL]
work.coll.bid = bid % nChannels;
NCCLCHECK(getNextOp(channel, NULL, &work));
work->coll.bid = bid % nChannels;
NCCLCHECK(getNextOp(channel, NULL, work));
//INFO(NCCL_COLL, "Host enqueue: bid %d channel %d index %ld nThreads %d funcIndex %d count %ld nChannels %d",
// work->coll.bid, channelId, channel->workFifoTail, work->nThreads, work->funcIndex, work->coll.count, work->coll.nChannels);
}
info->comm->opCount++;
comm->collOpCount++;
return ncclSuccess;
}
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
if (comm->asyncOpCount == 0) {
return ncclSuccess;
} else if (comm->asyncOpCount == 1) {
// No aggregation
struct ncclInfo* info = comm->asyncOps;
info->nChannels = 0;
NCCLCHECK(ncclSaveKernel(info));
NCCLCHECK(ncclSetupCollKernel(info));
} else {
// Aggregation
size_t channelSize = NCCL_AGG_CHANNEL_SIZE * comm->nRanks; // scale channel size based on nranks as latency increases
// Reduce the per-channel size if we cannot fully utilize the channels
while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
int channelUsed = 0;
for (int c = 0; c < comm->asyncOpCount; c++) {
struct ncclInfo* info = comm->asyncOps+c;
info->nChannels = std::min((int)DIVUP(info->nBytes, channelSize), comm->nChannels); // assign number of channels
NCCLCHECK(ncclSaveKernel(info));
channelUsed += info->nChannels;
NCCLCHECK(ncclSetupCollKernel(info));
}
// If we wrap around on channels, then the inlined op on channel 0 is not the last one on this channel
// Then we need to change active from 2 to 1
if (channelUsed > comm->nChannels) comm->args.active = 1;
}
// Reset counters
comm->asyncOpCount = 0;
@@ -583,8 +701,7 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if ((LOAD(comm->p2pNet) ? comm->channels[channelId].peers[peer].p2pSend.connected :
comm->channels[channelId].peers[peer].send.connected) == 0) {
if (comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
comm->connect = 1;
}
@@ -597,8 +714,7 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if ((LOAD(comm->p2pNet) ? comm->channels[channelId].peers[peer].p2pRecv.connected :
comm->channels[channelId].peers[peer].recv.connected ) == 0) {
if (comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
comm->connect = 1;
}
@@ -610,59 +726,169 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
return ncclSuccess;
}
static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
const int e = (info->comm->topo->nodes[GPU].count == info->comm->topo->nRanks && (info->comm->topo->type & RCCL_TOPO_4P2H_ROME))
? 1 : NCCL_MAX_WORK_ELEMENTS;
for (int s=0; s<e && work->elems[s].p2p.delta != info->delta; s++) {
static int getSegment(int delta, struct ncclWork* work, int e) {
for (int s=0; s<e && work->elems[s].p2p.delta != delta; s++) {
if (work->elems[s].p2p.nThreads == 0) return s;
}
return -1;
}
static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
struct ncclWorkElem* elem = work->elems+s;
static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) {
elem->comm = info->comm->devComm;
elem->funcIndex = FUNC_INDEX_P2P;
elem->nThreads = info->nThreads = NCCL_MAX_NTHREADS;
elem->nThreads = NCCL_MAX_NTHREADS;
elem->sendbuff = info->sendbuff;
elem->recvbuff = info->recvbuff;
elem->opCount = info->comm->lastOpCount;
elem->op.opCount = info->comm->collOpCount;
elem->p2p.sendCount = info->sendbytes;
elem->p2p.recvCount = info->recvbytes;
elem->p2p.sendChunkSize = info->sendChunkSize;
elem->p2p.recvChunkSize = info->recvChunkSize;
elem->p2p.delta = info->delta;
return ncclSuccess;
}
// Store a prepared P2P work element into slot `s` of `work`, then rebalance the
// thread count of every slot in use (0..s).
//   elem : prepared element, copied verbatim (input)
//   work : FIFO entry receiving the element
//   s    : index of the slot to fill
// Always returns ncclSuccess.
static ncclResult_t enqueueP2pOp(struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s) {
  // The slot receives a byte-for-byte copy of the prepared element
  struct ncclWorkElem* slot = &work->elems[s];
  memcpy(slot, elem, sizeof(struct ncclWorkElem));
  // Thread count is decided at enqueue time: start at 512 and halve until
  // usedSlots * threadsPerSlot no longer exceeds 256
  const int usedSlots = s+1;
  int threadsPerSlot = 512;
  for (; usedSlots*threadsPerSlot > 256; threadsPerSlot = threadsPerSlot/2) {}
  // (disabled, kept for reference) if (nThreads >= 128) nThreads += WARP_SIZE;
  // Every slot in use gets the same, freshly computed thread count
  for (int i = usedSlots; i-- > 0; ) {
    work->elems[i].p2p.nThreads = threadsPerSlot;
  }
  return ncclSuccess;
}
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
int channelId = info->channelId;
struct ncclChannel* channel = info->comm->channels+channelId;
ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) {
struct ncclWorkElem* workElem = &eqElem->work;
struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
// Try to reuse last p2p operation if not full yet
struct ncclChannel* channel = proxyArgs->subs[0].channel;
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
int segment = -1;
const int e = ((comm->topo->nodes[GPU].count == comm->topo->nRanks) && (comm->topo->type & RCCL_TOPO_4P2H_ROME))
? 1 : NCCL_MAX_WORK_ELEMENTS;
if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0) {
// Try to pack more segments into a single operation
segment = getSegment(info, w);
segment = getSegment(workElem->p2p.delta, w, e);
}
if (segment == -1) {
NCCLCHECK(getNextOp(channel, &w, NULL));
segment = 0;
}
NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
NCCLCHECK(saveP2pOp(info, w, segment));
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
// store work element into FIFO
NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs));
NCCLCHECK(enqueueP2pOp(workElem, w, segment));
comm->collOpCount++;
return ncclSuccess;
}
// Prepare (but do not enqueue) one P2P operation described by `info`:
//  - appends a queue element to comm->enqueueInfo and fills its proxy args and work element,
//  - grows the launch grid/block dimensions so they cover info->channelId and the element's nThreads,
//  - records the kernel function and the inlined first work element if none was recorded yet.
// Returns ncclSuccess, or the first error reported by a helper.
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
  ncclComm* comm = info->comm;
  // Templates for the kernel argument and the proxy argument live in a new queue element
  struct ncclQueueElem* elem;
  NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &elem));
  // The proxy code sets and tunes the send/recv chunk sizes, so it must run before the work element is computed.
  NCCLCHECK(ncclProxyComputeP2p(info, &elem->proxyArgs));
  NCCLCHECK(computeP2pWorkElem(info, &elem->work));
  hipLaunchParams* launch = comm->myParams;
  // Grid must span channels 0..channelId
  const unsigned blocksNeeded = (unsigned)(info->channelId+1);
  if (launch->gridDim.x < blocksNeeded) launch->gridDim.x = blocksNeeded;
  // Block must be at least as wide as this element's thread count
  const unsigned threadsNeeded = (unsigned)elem->work.nThreads;
  if (launch->blockDim.x < threadsNeeded) launch->blockDim.x = threadsNeeded;
  // params may be varied by a second graph, hence the channel count is captured here
  comm->enqueueInfo->maxChannels = launch->gridDim.x;
  // A kernel was already recorded: nothing more to do
  if (launch->func != NULL) return ncclSuccess;
  // Record the first kernel to launch. The inlined element only tells the kernel that this is a
  // P2P operation; the kernel does not use it as a fast-path argument.
  launch->func = (void *)ncclKerns[0];
  memcpy(&comm->args, &elem->work, sizeof(struct ncclWorkElem));
  return ncclSuccess;
}
// Host-side setup callback. `arg` is a struct ncclQueueInfo*.
// Walks the queued elements, enqueuing each as a P2P or collective kernel work item,
// then calls setupLaunch (passing USING_CUDA_GRAPH) and ncclLaunchProxy.
// The outcome is not returned; it is stored in eqInfo->ret, with a WARN on failure.
template<int USING_CUDA_GRAPH>
void HIPRT_CB ncclEnqueueHostSetup(void* arg) {
  struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg;
  ncclComm_t comm = eqInfo->comm;
  ncclResult_t ret;
  // The queue always has one extra element, so the walk stops at the tail rather than at NULL
  for (struct ncclQueueElem* cur = eqInfo->elemList.head; cur != eqInfo->elemList.tail; cur = cur->next) {
    if (cur->work.funcIndex != FUNC_INDEX_P2P) {
      NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, cur), ret, cb_end);
    } else {
      NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, cur), ret, cb_end);
    }
  }
  NCCLCHECKGOTO(setupLaunch(eqInfo, USING_CUDA_GRAPH), ret, cb_end);
  NCCLCHECKGOTO(ncclLaunchProxy(eqInfo), ret, cb_end);
cb_end:
  if (ret != ncclSuccess) {
    WARN("Failure in host setup : %s", ncclGetErrorString(ret));
  }
  // Callers read the result from the queue info
  eqInfo->ret = ret;
}
template void HIPRT_CB ncclEnqueueHostSetup<0>(void*);
template void HIPRT_CB ncclEnqueueHostSetup<1>(void*);
// Determine whether comm->userStream is currently being captured into a CUDA graph.
//   comm  : communicator; usingCudaGraph is reset to 0 on entry and set to 1 only when a capture is active
//   graph : output passed to cudaStreamGetCaptureInfo_v2; left untouched when CUDART_VERSION < 11030
// While a capture is active, a launch mode of GROUP is switched to GROUP_GRAPH.
// Returns ncclSuccess, or whatever CUDACHECK returns if the capture query fails.
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
comm->usingCudaGraph = 0;
// The whole detection is compiled out for runtimes older than 11.3
#if CUDART_VERSION >= 11030
cudaStreamCaptureStatus captureStatus;
unsigned long long cudaGraphId;
CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
if (captureStatus == cudaStreamCaptureStatusActive) {
// A graph id different from the cached one means a new capture has started
if (cudaGraphId != comm->lastCudaGraphId) {
INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
// We are in a new graph, hence need to forget the last setup node so that
// the first setup node in the new graph will not have a dependency
comm->lastCudaGraphId = cudaGraphId;
comm->lastSetupNode = NULL;
}
if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH;
comm->usingCudaGraph = 1;
}
#endif
return ncclSuccess;
}
// Add the host-side setup step for comm->enqueueInfo to `graph` as a host node that runs
// ncclEnqueueHostSetup<1>, chained after the previous setup node of the same graph (if any).
//   comm  : communicator whose enqueueInfo is handed to the host node; lastSetupNode is updated
//   graph : graph being captured
// Returns ncclSuccess on success. When CUDART_VERSION < 11030 it warns and returns ncclInternalError.
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) {
#if CUDART_VERSION >= 11030
struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
// Create a CUDA object to wrap around the argument space
// which CUDA graph would manage lifetime of
cudaUserObject_t object;
// ncclDestroyQueueInfo is registered as the user object's destroy callback
CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync));
// cudaGraphUserObjectMove: the reference created above is transferred to the graph
CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove));
cudaHostFn_t fn = ncclEnqueueHostSetup<1>;
// Add a CPU node to the graph
cudaGraphNode_t setupNode;
cudaHostNodeParams setupNodeParams = {fn, eqInfo};
// Depend on the previous setup node only if one exists for this graph
int numDependencies = comm->lastSetupNode == NULL ? 0 : 1;
CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams));
// Add the setup node to the capture dependencies of the user stream
CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies));
// Remember this node so the next setup node can be chained after it
comm->lastSetupNode = setupNode;
return ncclSuccess;
#else
WARN("NCCL does not support this CUDA version for CUDA graph feature");
return ncclInternalError;
#endif
}
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
// Launch asynchronously if needed
if (ncclAsyncMode()) {
@@ -681,12 +907,12 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
NCCLCHECKGOTO(checkSetStream(info), ret, end);
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
if (info->coll == ncclFuncSendRecv) { //p2p stored separately
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->lastOpCount, info->sendbuff, info->recvbuff, info->count,
info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
} else {
@@ -703,13 +929,30 @@ end:
NCCLCHECK(checkSetStream(info));
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
NCCLCHECK(ncclSaveKernel(info));
NCCLCHECK(ncclBarrierEnqueue(info->comm));
NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
NCCLCHECK(ncclEnqueueEvents(info->comm));
// Check whether we are in cuda graph mode
cudaGraph_t graph;
ncclComm_t comm = info->comm;
NCCLCHECK(ncclGetCudaGraph(comm, &graph));
// Common part between graph mode and non-graph mode
NCCLCHECK(ncclSetupCollKernel(info));
// Host setup
if (comm->usingCudaGraph) {
NCCLCHECK(ncclCudaGraphHostSetup(comm, graph));
} else {
ncclEnqueueHostSetup<0>(comm->enqueueInfo);
NCCLCHECK(comm->enqueueInfo->ret);
}
// Common part between graph mode and non-graph mode
NCCLCHECK(ncclLaunchBarrier(comm));
NCCLCHECK(ncclLaunchKernel(comm));
NCCLCHECK(ncclRecordEvents(comm));
NCCLCHECK(ncclLaunchReset(comm));
return ncclSuccess;
}
}
+82 -59
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,23 +9,29 @@
#include "graph.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nChannels = comm->nChannels;
for (int c=0; c<comm->nChannels; c++) {
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->ring.prev = channel->ring.next = -1;
channel->tree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
channel->collTree.out = -1;
channel->collTree.headRank = -1;
channel->collTree.nHeads = 0;
channel->collTree.shift = 0;
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.up[i] = -1;
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
@@ -54,25 +60,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
}
// Duplicate channels rings/trees
struct ncclChannel* channel0 = comm->channels;
struct ncclChannel* channel1 = channel0+comm->nChannels;
memcpy(channel1, channel0, comm->nChannels*sizeof(struct ncclChannel));
// Setup collnet tree
for (int c=0; c<comm->collNetnChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->collTree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
int* collNetIntra = collNetGraph->intra+c*localRanks;
for (int i=0; i<localRanks; i++) {
if (collNetIntra[i] == rank) {
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
channel->collTree.up = collNetIntra[prev];
channel->collTree.down[0] = collNetIntra[next];
}
}
}
struct ncclChannel* channel1 = channel0+nChannels;
memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
return ncclSuccess;
}
@@ -176,36 +165,53 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int*
return ncclSuccess;
}
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) {
int nranks = comm->nRanks;
int depth = nranks/comm->nNodes;
int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks;
for (int c=0; c<comm->collNetnChannels/2; c++) {
struct ncclChannel* channel = comm->channels+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
channel->collTree.up = nranks;
}
if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTree.down[0] = -1;
}
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nHeads = collNetGraph->nChannels;
int *heads;
NCCLCHECK(ncclCalloc(&heads, nHeads));
// Find all head ranks
// Head index is always 0
for (int c=0; c<nHeads; c++) {
int* collNetIntra = collNetGraph->intra+c*localRanks;
heads[c] = collNetIntra[0];
}
int recvIndex = 0; // recv GPU index is always 0
int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
for (int c=0; c<comm->collNetnChannels/2; c++) {
struct ncclChannel* channel = comm->channels+comm->collNetnChannels/2+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
channel->collTree.up = nranks;
// For all channels
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
char line[1024];
sprintf(line, "CollNet channel %d rank %d ", c, rank);
int nDown = 0;
for (int i=0; i<nHeads; i++) {
if (rank == heads[i]) { // is head
channel->collTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel
channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks
int* collNetIntra = collNetGraph->intra+i*localRanks;
sprintf(line+strlen(line), "down ");
for (int r=0; r<localRanks; r++) {
if (collNetIntra[r] == rank) continue;
channel->collTree.down[nDown++] = collNetIntra[r]; // connect to all peers
sprintf(line+strlen(line), " %d ", collNetIntra[r]);
}
sprintf(line+strlen(line), "nDown %d ", nDown);
break;
}
}
if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTree.down[0] = -1;
// Connect to all heads
int nUp = 0;
sprintf(line+strlen(line), "up ");
for (int h=0; h<nHeads; h++) {
if (rank == heads[h]) continue;
channel->collTree.up[nUp++] = heads[h];
sprintf(line+strlen(line), " %d ", heads[h]);
}
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->collNetnChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
channel->collTree.nHeads = nHeads;
channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
INFO(NCCL_GRAPH, "%s", line);
}
return ncclSuccess;
}
@@ -240,7 +246,18 @@ int ncclMaxNchannels() {
return maxNchannels;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, int nc) {
// Fill channels [start, end) by replicating earlier ones: channel c is a copy of channel c-start,
// for both the ring prev/next tables (nRanks ints per channel) and the ncclChannel structs.
// Copies run in increasing order of c.
// Returns the first index not written, i.e. `end`, or `start` when end <= start.
static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
  const int ranksPerRow = comm->nRanks;
  int dst = start;
  while (dst < end) {
    const int src = dst - start;
    // Ring neighbour tables: one row of nRanks entries per channel
    memcpy(&ringPrev[dst*ranksPerRow], &ringPrev[src*ranksPerRow], ranksPerRow*sizeof(int));
    memcpy(&ringNext[dst*ranksPerRow], &ringNext[src*ranksPerRow], ranksPerRow*sizeof(int));
    // Channel descriptor itself
    memcpy(&comm->channels[dst], &comm->channels[src], sizeof(struct ncclChannel));
    dst++;
  }
  return dst;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
int nranks = comm->nRanks;
@@ -272,19 +289,25 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
// Get number of channels after duplication
nc *= comm->nChannels;
// Duplication should be complete now
nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
// Setup CollNet
if (comm->collNetSupport == 1) {
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
NCCLCHECK(connectCollNet(comm, collNetGraph));
}
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
int c;
for (c=nChannels; c<std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels())); c++) {
memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
}
nChannels = comm->nChannels = c;
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(nc, ncclMinNchannels()), ringPrev, ringNext);
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
+4 -5
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -279,8 +279,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
else p2pLevel = PATH_SYS;
p2pLevel = PATH_PXB;
}
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
@@ -595,6 +594,6 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
comm->p2pChannels[c] = mirror;
}
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->collNetnChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
return ncclSuccess;
}
+79 -44
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -429,9 +429,67 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance.
//   system      : topology; each NET node's paths to every GPU are inspected
//   nets        : output array of NET node indices; the insertion logic below can write up to
//                 system->nodes[NET].count entries, so the caller must provide that much room
//   netcountRet : output, number of entries written to nets
// For each NIC the reference path is its widest GPU path, ties broken by the lowest hop count.
// NICs whose reference width is below the best one seen are dropped; the rest are kept ordered
// by increasing hop count. NICs with equal hop count are then rotated by the first GPU's device number.
// Returns ncclSuccess, or the allocation error if a scratch array cannot be allocated.
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) {
  float* maxwidths;
  int* minhops;
  int netcount = 0;
  NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count));
  // Release the first scratch array before reporting a failed second allocation,
  // otherwise the early return taken by NCCLCHECK would leak it.
  ncclResult_t allocRet = ncclCalloc(&maxwidths, system->nodes[NET].count);
  if (allocRet != ncclSuccess) free(minhops);
  NCCLCHECK(allocRet);
  for (int n=0; n<system->nodes[NET].count; n++) {
    maxwidths[n] = 0.0;
    minhops[n] = 255;
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    struct ncclTopoLinkList* paths = net->paths[GPU];
    // Reference path for this NIC: widest first, then fewest hops
    for (int g=0; g<system->nodes[GPU].count; g++) {
      if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) {
        maxwidths[n] = paths[g].width;
        minhops[n] = paths[g].count;
      }
    }
    if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW
    if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW
    // Find the insertion point that keeps nets sorted by increasing hop count
    int index;
    for (index = 0; index < netcount; index++) {
      if (minhops[n] < minhops[nets[index]]) break;
    }
    // Shift all nets with higher nhops
    for (int i = netcount; i>index; i--) nets[i] = nets[i-1];
    // Insert this net at index
    nets[index] = n;
    netcount++;
  }
  *netcountRet = netcount;
  // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have
  // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs.
  for (int start = 0; start < netcount;) {
    // [start, end) is a run of NICs sharing the same hop count
    int end = start+1;
    while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++;
    // Rotate the run left, once per unit of (dev % run length)
    for (int r=0; r<system->nodes[GPU].nodes[0].gpu.dev % (end-start); r++) {
      int netStart = nets[start];
      for (int i=start; i<end-1; i++) nets[i] = nets[i+1];
      nets[end-1] = netStart;
    }
    start = end;
  }
  free(minhops);
  free(maxwidths);
  return ncclSuccess;
}
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int speed = graph->speedInter;
for (int n=0; n<system->nodes[NET].count; n++) {
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netcount;
NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount));
for (int i=0; i<netcount; i++) {
int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoNode* gpu;
if (graph->collNet && net->net.collSupport == 0) continue;
@@ -457,10 +515,19 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (graph->nChannels == 0) {
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
struct ncclTopoLinkList* paths = net->paths[GPU];
int f = 0, f_gdr = 0;
// find the first GPU that is closest to NIC
int f = 0;
for (int i = 0; i<system->nodes[GPU].count; i++)
if (paths[i].count < paths[f].count) f = i;
for (int i = 0; i<system->nodes[GPU].count; i++) {
if (paths[i].count <= paths[f].count) {
// prefer GPU direct RDMA
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
f = i;
f_gdr = gdr;
}
}
}
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? FORCED_ORDER_PCI : 0, &t, NET, n, f));
if (t == -1) *time = -1;
@@ -504,6 +571,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
}
free(nets);
return ncclSuccess;
}
@@ -700,7 +768,6 @@ RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
graph->speedIntra = graph->speedInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
@@ -729,41 +796,13 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (graph->nChannels) {
system->type |= RCCL_TOPO_4P2H_ROME;
}
} else if (!rcclParamModelMatchingDisable()) {
} else if (!rcclParamModelMatchingDisable() && !graph->collNet) {
// try to match 8P6L
NCCLCHECK(parseChordalRing(system, graph));
if (graph->nChannels) return ncclSuccess;
// try to match Rome 4P2H
NCCLCHECK(parseRome4P2H(system, graph));
}
if (graph->collNet && graph->nChannels) {
struct ncclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
int nets[MAXCHANNELS], n = 0;
for (int i = 0; i < tmpGraph.nChannels; i++) {
int j;
for (j = 0; j < n; j++) {
if (nets[j] == tmpGraph.inter[i*2])
break;
}
if (j >= n)
nets[n++] = tmpGraph.inter[i*2];
}
for (int i = 0; i < n; i++) {
int j;
for (j = 0; j < tmpGraph.nChannels; j++) {
if (nets[i] == tmpGraph.inter[j*2])
break;
}
if (j < tmpGraph.nChannels) {
memcpy(graph->intra+i*ngpus, &tmpGraph.intra[j*ngpus], ngpus*sizeof(int));
memcpy(graph->inter+i*2, &tmpGraph.inter[j*2], 2*sizeof(int));
}
}
memcpy(graph->intra+n*ngpus, graph->intra, ngpus*sizeof(int)*n);
memcpy(graph->inter+n*2, graph->inter, 2*sizeof(int)*n);
graph->nChannels = n;
}
if (graph->nChannels) return ncclSuccess;
if ((graph->pattern == NCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) {
@@ -806,6 +845,7 @@ search:
for (int g=0; g<ngpus; g++) {
printf("%d ", graph->intra[c*ngpus+g]);
}
printf("[%d %d]", graph->inter[0], graph->inter[1]);
printf("\n");
}
#endif
@@ -897,12 +937,7 @@ done:
graph->nChannels = 1;
}
if (graph->nChannels && graph->collNet) {
// duplicate collnet channels
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, ngpus*sizeof(int)*graph->nChannels);
memcpy(graph->inter+graph->nChannels*2, graph->inter, 2*sizeof(int)*graph->nChannels);
}
else if (graph->speedIntra >= 25.0) {
if (graph->speedIntra >= 25.0) {
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
@@ -951,7 +986,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) {
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) {
if (graph) {
// Honor the net device in the graph
int channel = channelId%graph->nChannels;
@@ -960,7 +995,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
*dev = graph->inter[channel*2+index];
} else {
int64_t id;
NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId));
NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr));
*dev = id;
}
return ncclSuccess;
+74 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -182,6 +182,65 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
return ncclSuccess;
}
// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch as this extra level can break the search and make
// NCCL take wrong topology decisions.
//
// For every PCI node whose device id matches the PEX Gen 4 base-mode pattern, each directly linked
// PCI node with the same device id is merged into it: the sub switch's links are moved to the
// parent, the peers' back-links are redirected to the parent, and the sub switch node is removed.
// Returns ncclSuccess, or the first error from ncclCalloc / ncclTopoIdToIndex / ncclTopoRemoveNode.
// The scratch id list is released on every path, including the error paths.
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
  for (int s=0; s<system->nodes[PCI].count; s++) {
    struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
    uint64_t device = pciSwitch->pci.device;
    // Only flatten PEX Gen 4 switches in base mode
    if ((device & 0xfffffffffffff000) != 0x1000c0101000a000) continue;
    // Find sub switches with the same device ID.
    int64_t* subSwIds;
    NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
    int subs = 0;
    for (int l=0; l<pciSwitch->nlinks; l++) {
      struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
      // Only fuse sub switches with the same device ID.
      if (sub->type != PCI || sub->pci.device != device) continue;
      // Save sub switch for later
      subSwIds[subs++] = sub->id;
      // Remove link to that sub switch
      memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink)));
      pciSwitch->nlinks--;
      // Don't increase l for the next iteration as we just shifted all links by one.
      l--;
    }
    // Errors are collected instead of returned on the spot so that subSwIds is always freed.
    ncclResult_t mergeRet = ncclSuccess;
    for (int i=0; i<subs; i++) {
      // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
      int index;
      mergeRet = ncclTopoIdToIndex(system, PCI, subSwIds[i], &index);
      if (mergeRet != ncclSuccess) break;
      struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
      // Connect all sub PCI devices to the parent switch
      // NOTE(review): no capacity check is visible here before appending to pciSwitch->links;
      // confirm the links array can hold the merged set.
      for (int l=0; l<sub->nlinks; l++) {
        struct ncclTopoNode* remNode = sub->links[l].remNode;
        if (remNode == pciSwitch) continue;
        // Add link from parent PCI switch -> PCI device
        memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
        pciSwitch->nlinks++;
        // Update link from PCI device -> parent PCI switch
        for (int rl=0; rl<remNode->nlinks; rl++) {
          if (remNode->links[rl].remNode == sub) {
            remNode->links[rl].remNode = pciSwitch;
            break;
          }
        }
      }
      mergeRet = ncclTopoRemoveNode(system, PCI, index);
      if (mergeRet != ncclSuccess) break;
    }
    free(subSwIds);
    // Propagate any failure only after the scratch list has been released.
    NCCLCHECK(mergeRet);
    // Set subdevice to 0x0000 to make sure we don't merge this switch again.
    pciSwitch->pci.device = 0x1000c01010000000;
    // Restart, as system->nodes[PCI].nodes has changed.
    // (The loop increment then resumes the scan at index 1.)
    s = 0;
  }
  return ncclSuccess;
}
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
// And connect all CPU nodes together
for (int n=0; n<system->nodes[CPU].count; n++) {
@@ -200,6 +259,8 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
} else if (node->type == CPU) {
sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else if (node->type == PCI) {
sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
} else {
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
}
@@ -360,6 +421,15 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, busId));
} else if (type == PCI) {
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 48;
NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 32;
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 16;
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
if (str) node->pci.device += strtol(str, NULL, 0);
for (int s=0; s<xmlPci->nSubs; s++) {
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
@@ -554,6 +624,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
#endif
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
NCCLCHECK(ncclTopoSortSystem(*topoSystem));
@@ -681,7 +752,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
}
if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
}
*id = nets[rr % count];
*id = nets[rr%count];
free(nets);
return ncclSuccess;
}
+6 -4
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -29,8 +29,7 @@
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P(speed) (speed*9/12)
#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
#define INTEL_P2P_OVERHEAD(speed) (speed*6/5)
#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
@@ -114,6 +113,9 @@ struct ncclTopoNode {
int model;
cpu_set_t affinity;
}cpu;
struct {
uint64_t device;
}pci;
};
int nlinks;
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
+11 -8
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -63,11 +63,11 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 39.0
// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 2.5, 2.5, 5.5 }, /* Ring (LL/LL128/Simple)*/ { 2.5, 2.5, 5 }, /* CollNet (LL/LL128/Simple)*/ { 1.1, 1.1, 3.3 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.5, 2.5, 5.5 }, /* Ring (LL/LL128/Simple)*/ { 2.5, 2.5, 5 }, /* CollNet (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 1.1, 1.1, 1.7 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 28.0, 28.0, 66.0 }, /* Ring (LL/LL128/Simple)*/ { 8.5, 8.5, 19.0 }, /* CollNet (LL/LL128/Simple)*/ { 6.5, 6.5, 14.5 } }
{ /* Tree (LL/LL128/Simple)*/ { 28.0, 28.0, 66.0 }, /* Ring (LL/LL128/Simple)*/ { 8.5, 8.5, 19.0 }, /* CollNet (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 } }
};
// LL128 max BW (per channel) for the different collectives
@@ -87,8 +87,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
#else
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
#endif
@@ -148,8 +150,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
#endif
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/2.0; // Take into account that GDR read is disabled on both sides
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple
// Convert bus BW to algorithm BW
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
@@ -158,6 +159,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
float interLat = hwLat[NCCL_HW_NET][a][p];
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (a == NCCL_ALGO_RING) {
float lat = hwLat[hw[a]][a][p];
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
@@ -252,6 +254,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
}
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 512;
// Override defaults with user env
char* str = getenv("NCCL_THREAD_THRESHOLDS");
+22 -6
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -17,10 +17,6 @@
#include "core.h"
#include "nvmlwrap.h"
#include "xml.h"
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#endif
/*******************/
/* XML File Parser */
@@ -482,6 +478,26 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+40 -12
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -127,7 +127,7 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
info.sendbytes = sendbytes;
info.recvbytes = recvbytes;
if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
NCCLCHECK(ncclSaveP2pKernel(&info));
NCCLCHECK(ncclSetupP2pKernel(&info));
return ncclSuccess;
}
@@ -135,7 +135,7 @@ void* ncclAsyncThreadPreconnect(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
struct ncclComm* comm = args->coll.comm;
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL));
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P));
return args;
}
@@ -165,6 +165,7 @@ ncclResult_t ncclGroupEnd() {
int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
ncclResult_t ret = ncclGroupError;
int usingCudaGraphAll = -1;
if (ret != ncclSuccess) goto group_cleanup;
/* Launch async ncclCommInitRank */
@@ -307,34 +308,62 @@ sched_delta:
* prevent some ranks from launching their network threads, which would
* prevent the NCCL call from completing, blocking the cudaFree call.
*/
// Check whether we are in cuda graph mode
cudaGraph_t* graphs;
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
ncclComm_t comm = args->coll.comm;
NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup);
NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup);
if (usingCudaGraphAll == -1) {
usingCudaGraphAll = comm->usingCudaGraph;
} else if (usingCudaGraphAll != comm->usingCudaGraph) {
WARN("Illegal to have some communicators in graph mode while others not");
ret = ncclInvalidUsage;
goto group_cleanup;
}
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
if (args->coll.comm->userStream == NULL)
ncclComm_t comm = args->coll.comm;
NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
if (args->coll.comm->userStream == hipStreamDefault/* ||
args->coll.comm->userStream == hipStreamPerThread ||
args->coll.comm->userStream == hipStreamLegacy*/)
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclBarrierEnqueue(args->coll.comm), ret, end);
if (usingCudaGraphAll == 1) {
NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end);
} else {
ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo);
}
NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclBarrierEnqueueWait(args->coll.comm), ret, end);
NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
if (args->coll.comm->userStream == NULL)
if (args->coll.comm->userStream == hipStreamDefault/* ||
args->coll.comm->userStream == hipStreamPerThread ||
args->coll.comm->userStream == hipStreamLegacy*/)
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end);
NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end);
}
}
@@ -373,8 +402,7 @@ group_cleanup:
pthread_mutex_unlock(&state->poolMutex);
state->nextOps = NULL;
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
comm->userStreamSet = false;
ncclLaunchReset(comm);
}
}
}
+8 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -52,11 +52,16 @@ extern struct allocationTracker allocTracker[];
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
// Need async stream for P2P pre-connect + CUDA Graph
hipStream_t stream;
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
if (isFineGrain)
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
CUDACHECK(hipStreamSynchronize(stream));
CUDACHECK(hipStreamDestroy(stream));
int dev;
CUDACHECK(hipGetDevice(&dev));
if (dev < MAX_ALLOC_TRACK_NGPU) {
+3 -3
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,8 +14,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
ncclResult_t bootstrapClose(void* commState);
+11 -2
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,6 +28,15 @@
} \
} while(false)
// Report failure but clear error and continue.
// Evaluates `cmd` once. If the result is not hipSuccess, the failure is logged
// through INFO(NCCL_ALL, ...) with the current file and line, and
// hipGetLastError() is called with its result discarded. Unlike a
// return-on-error check, this macro never returns from the enclosing function.
// Note: the macro declares a local named `err`, so `cmd` must not itself refer
// to a caller variable called `err`.
#define CUDACHECKIGNORE(cmd) do { \
hipError_t err = cmd; \
if( err != hipSuccess ) { \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, hipGetErrorString(err)); \
(void) hipGetLastError(); \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
+2 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -81,4 +81,5 @@ DECL_ALL
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define SENDRECV_SLICEFACTOR 1
#endif
+16 -8
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -15,6 +15,9 @@
// [/RCCL]
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
typedef void *cudaGraph_t;
typedef void *cudaGraphNode_t;
#define HIPRT_CB
#else
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
@@ -84,16 +87,17 @@ struct ncclComm {
int nNodes;
int localRanks;
enum { GROUP, PARALLEL } launchMode;
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
hipStream_t userStream;
bool userStreamSet;
hipEvent_t doneEvent;
hipEvent_t intDoneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
uint64_t lastOpCount;
// Collective operation counter
uint64_t collOpCount;
// Channels for collectives
int nChannels;
@@ -101,8 +105,6 @@ struct ncclComm {
int p2pnChannels;
int p2pnChannelsPerPeer;
int p2pChannels[MAXCHANNELS];
//Channels for collnet
int collNetnChannels;
// Buffer sizes
int buffSizes[NCCL_NUM_PROTOCOLS];
@@ -157,6 +159,7 @@ struct ncclComm {
struct ncclInfo* asyncOps;
int asyncOpCount;
size_t asyncTotalSize;
int lastChannel;
//list of async p2p operation queued in a group semantics
struct ncclP2Plist* p2pSends;
@@ -169,6 +172,11 @@ struct ncclComm {
int rootPid; // Process ID of root
// [/RCCL]
// Store info for cudaGraph
int usingCudaGraph; // Only use it during capture time, not launch time
struct ncclQueueInfo* enqueueInfo;
cudaGraphNode_t lastSetupNode;
unsigned long long lastCudaGraphId;
};
#endif
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+33 -9
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -125,8 +125,9 @@ struct ncclConnInfo {
struct ncclConnector {
int connected;
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
void* transportResources;
struct ncclConnInfo conn;
struct ncclComm *comm;
};
@@ -151,11 +152,23 @@ struct ncclTree {
int down[NCCL_MAX_TREE_ARITY];
};
#define NCCL_MAX_DIRECT_ARITY 7
struct ncclDirect {
int depth;
int out;
int nHeads;
int headRank;
int shift;
int up[NCCL_MAX_DIRECT_ARITY];
int down[NCCL_MAX_DIRECT_ARITY];
};
#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2)
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_CONNS 3
struct ncclPeer {
struct ncclConnector send;
struct ncclConnector recv;
struct ncclConnector p2pSend;
struct ncclConnector p2pRecv;
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
};
struct ncclDevComm;
@@ -179,7 +192,6 @@ struct ncclWorkElem {
const void * sendbuff;
void * recvbuff;
uint64_t opCount;
// Op-specific fields.
union {
struct {
@@ -192,9 +204,15 @@ struct ncclWorkElem {
struct {
size_t sendCount;
size_t recvCount;
int sendChunkSize;
int recvChunkSize;
int32_t delta;
uint16_t nThreads;
} p2p;
struct {
uint16_t padding[15];
uint16_t opCount;
} op;
// [RCCL] Clique-based arguments
// NOTE: Follows same field structure as coll
// because nChannels is accessed from "coll" struct.
@@ -206,7 +224,7 @@ struct ncclWorkElem {
uint8_t nChannels;
} clique;
// [/RCCL]
uint64_t align[3];
uint64_t align[4];
};
};
struct ncclWork {
@@ -219,7 +237,7 @@ struct ncclChannel {
struct {
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collTree;
struct ncclDirect collTree;
int id;
@@ -241,6 +259,12 @@ struct ncclChannel {
float bw_cumulative;
int bw_count;
#endif
uint16_t index; // Only used by GPU
// GDRCOPY support
struct ncclWork* workFifoGdr;
struct ncclWork* workFifoDev;
void* gdrMemDesc;
};
int data[0x80];
};
+74 -7
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,15 +11,82 @@
#include "group.h"
#include "collectives.h"
size_t ncclKernMaxLocalSize();
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
ncclResult_t ncclLaunchReset(ncclComm_t comm);
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
template<int USING_CUDA_GRAPH>
void HIPRT_CB ncclEnqueueHostSetup(void* arg);
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
// Enqueue information (for kernel and proxy) for each operation
struct ncclQueueElem {
struct ncclWorkElem work;
struct ncclProxyArgs proxyArgs;
struct ncclQueueElem* next;
};
// Store enqueue elements in a list
struct ncclQueueElemList {
struct ncclQueueElem* head;
struct ncclQueueElem* tail;
};
// Structure passed to CUDA graph
struct ncclQueueInfo {
ncclComm_t comm;
int maxChannels; // Dynamic version of gridDim
ncclResult_t ret; // Return value of host setup call
struct ncclQueueElemList elemList;
};
// Get next element from enqueue list.
// Hands out the element currently at list->tail through *elemOut and advances
// the tail by one, allocating the first element and/or the following element
// on demand. A recycled element has its `work` and `proxyArgs` members zeroed
// while its `next` link is kept, so the chain can be reused after a reset.
// Returns ncclInternalError when eqInfo is NULL, or the allocation error.
static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) {
  if (eqInfo == NULL) return ncclInternalError;
  struct ncclQueueElemList* list = &eqInfo->elemList;
  if (list->tail != NULL) {
    *elemOut = list->tail;
    // Clear each payload member on its own rather than one span of
    // sizeof(work)+sizeof(proxyArgs): that sum only covers both members if
    // the compiler inserts no padding between them.
    memset(&list->tail->work, 0, sizeof(list->tail->work));
    memset(&list->tail->proxyArgs, 0, sizeof(list->tail->proxyArgs));
  } else {
    // Empty list: the first element is both head and tail.
    NCCLCHECK(ncclCalloc(&list->tail, 1));
    *elemOut = list->tail;
    list->head = list->tail;
  }
  // Make sure there is always a successor to move the tail onto.
  if (list->tail->next == NULL) {
    NCCLCHECK(ncclCalloc(&list->tail->next, 1));
  }
  list->tail = list->tail->next;
  return ncclSuccess;
}
// Reset element queue.
// Rewinds the list so the next ncclAddQueueElem starts again from the head,
// and clears the per-launch bookkeeping (channel count and stored result).
// Already allocated elements are kept. Returns ncclInternalError on NULL.
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
  if (eqInfo != NULL) {
    struct ncclQueueElemList* elems = &eqInfo->elemList;
    elems->tail = elems->head;
    eqInfo->ret = ncclSuccess;
    eqInfo->maxChannels = 0;
    return ncclSuccess;
  }
  return ncclInternalError;
}
// Destroy enqueue info space
// used by both CUDA graph and non CUDA graph.
// Takes the ncclQueueInfo as a void* and frees every element of its list
// followed by the info structure itself. A NULL pointer is a no-op.
static void ncclDestroyQueueInfo(void* ptr) {
  struct ncclQueueInfo* info = (struct ncclQueueInfo*)ptr;
  if (info == NULL) return;
  for (struct ncclQueueElem* cur = info->elemList.head; cur != NULL; ) {
    // Grab the successor before the current element is released.
    struct ncclQueueElem* following = cur->next;
    free(cur);
    cur = following;
  }
  free(info);
}
#endif // End include guard
+272
Просмотреть файл
@@ -0,0 +1,272 @@
/*************************************************************************
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GDRWRAP_H_
#define NCCL_GDRWRAP_H_
#include "nccl.h"
#include <stdint.h> // for standard [u]intX_t types
#include <stdio.h>
// These can be used if the GDR library isn't thread safe
#include <pthread.h>
extern pthread_mutex_t gdrLock;
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
#define GDRLOCKCALL(cmd, ret) do { \
GDRLOCK(); \
ret = cmd; \
GDRUNLOCK(); \
} while(false)
#define GDRCHECK(cmd) do { \
int e; \
/* GDRLOCKCALL(cmd, e); */ \
e = cmd; \
if( e != 0 ) { \
WARN("GDRCOPY failure %d", e); \
return ncclSystemError; \
} \
} while(false)
// This is required as the GDR memory is mapped WC
#if !defined(__NVCC__)
#if defined(__PPC__)
static inline void wc_store_fence(void) { asm volatile("sync") ; }
#elif defined(__x86_64__)
#include <immintrin.h>
static inline void wc_store_fence(void) { _mm_sfence(); }
#elif defined(__aarch64__)
#ifdef __cplusplus
#include <atomic>
static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
#else
#include <stdatomic.h>
static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
#endif
#endif
#endif
//#define GDR_DIRECT 1
#ifdef GDR_DIRECT
// Call the GDR API library code directly rather than via
// dlopen() wrappers
#include <gdrapi.h>
static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
GDRCHECK(gdr_unpin_buffer(g, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
GDRCHECK(gdr_get_info(g, handle, info));
return ncclSuccess;
}
// Direct-call wrapper around gdr_map(): forwards the arguments unchanged and
// turns a non-zero library status into ncclSystemError through GDRCHECK.
static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
  // Pass the arguments themselves; the call must not repeat their types.
  GDRCHECK(gdr_map(g, handle, va, size));
  return ncclSuccess;
}
// Direct-call wrapper around gdr_unmap(): forwards the arguments unchanged and
// turns a non-zero library status into ncclSystemError through GDRCHECK.
static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
  // Pass the arguments themselves; the call must not repeat their types.
  GDRCHECK(gdr_unmap(g, handle, va, size));
  return ncclSuccess;
}
static void wrap_gdr_runtime_get_version(int *major, int *minor) {
gdr_runtime_get_version(major, minor);
return ncclSuccess;
}
static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
gdr_driver_get_version(g, major, minor);
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
return ncclSuccess;
}
#else
// Dynamically handle the dependency on the GDR API library
/* Extracted from gdrapi.h (v2.1 Nov 2020) */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
struct gdr;
typedef struct gdr *gdr_t;
typedef struct gdr_mh_s {
unsigned long h;
} gdr_mh_t;
struct gdr_info {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
};
typedef struct gdr_info gdr_info_t;
/* End of gdrapi.h */
ncclResult_t wrap_gdr_symbols(void);
gdr_t wrap_gdr_open(void);
ncclResult_t wrap_gdr_close(gdr_t g);
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
#endif // GDR_DIRECT
// Global GDR driver handle
extern gdr_t ncclGdrCopy;
#include "alloc.h"
// Bookkeeping for one GDRCopy allocation. ncclGdrCudaCalloc() fills it in and
// hands it out as the opaque gdrHandle; ncclGdrCudaFree() consumes it.
typedef struct gdr_mem_desc {
// Raw (not yet page-aligned) device allocation; released with hipFree().
void *gdrDevMem;
// Address produced by wrap_gdr_map() for the pinned region.
void *gdrMap;
// Stored as off+align by ncclGdrCudaCalloc(): byte offset from gdrDevMem to
// the device address returned to the caller.
size_t gdrOffset;
// Size passed to pin/map (request rounded up to GPU_PAGE_SIZE); reused for unmap.
size_t gdrMapSize;
// Pin handle produced by wrap_gdr_pin_buffer().
gdr_mh_t gdrMh;
} gdr_mem_desc_t;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// HIP build stub: no GDRCopy handle is ever opened; always returns NULL.
static gdr_t ncclGdrInit() {
return NULL;
}
// HIP build stub: allocates nothing and reports ncclSuccess.
// NOTE(review): *ptr, *devPtr and *gdrHandle are left untouched, so a caller
// must not use them after this returns. Presumably never reached on HIP
// because ncclGdrInit() returns NULL there - confirm against callers.
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
return ncclSuccess;
}
// HIP build stub: copies nothing and reports ncclSuccess.
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
return ncclSuccess;
}
// HIP build stub: releases nothing and reports ncclSuccess.
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
return ncclSuccess;
}
#else
// Loads the GDRAPI symbols, opens the driver and validates versions.
// Returns the open handle when both the library and the driver report
// version 2.1 or later; returns NULL in every other case (symbols missing,
// open failed, version query failed, or version too old). A handle that was
// opened but rejected is closed before returning.
static gdr_t ncclGdrInit() {
  int libMajor, libMinor, drvMajor, drvMinor;
  gdr_t handle = NULL;
  ncclResult_t res;

  // Nothing to do when the GDRAPI library symbols cannot be loaded
  if (wrap_gdr_symbols() != ncclSuccess) return NULL;

  handle = wrap_gdr_open();
  if (handle == NULL) return NULL;

  // Library (libgdrapi) version first, then the gdrdrv driver version
  NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
  NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);

  // Accept only GDRAPI 2.1 and later, for both library and driver
  if ((libMajor > 2 || (libMajor == 2 && libMinor >= 1)) &&
      (drvMajor > 2 || (drvMajor == 2 && drvMinor >= 1))) {
    INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
    return handle;
  }

  // Version too old: fall through to the shared failure path
error:
  if (handle != NULL) (void) wrap_gdr_close(handle);
  return NULL;
}
// Allocates nelem elements of T in device memory, pins the page-aligned part
// of that allocation with GDRCopy and maps it.
//   ptr       - out: address inside the GDRCopy mapping
//   devPtr    - out (optional, may be NULL): matching device address
//   nelem     - number of elements requested
//   gdrHandle - out: opaque descriptor to pass to ncclGdrCudaCopy/ncclGdrCudaFree
// Returns ncclSuccess, or the first error encountered. On error every
// resource acquired so far (mapping, pin, device memory) is released so the
// failure path does not leak.
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
  // All locals are declared before the first goto so that no jump crosses an
  // initialization.
  gdr_info_t info;
  gdr_mh_t mh;
  char *devMem = NULL;
  void *gdrMap = NULL;
  gdr_mem_desc_t* md = NULL;
  uint64_t alignedAddr = 0;
  size_t align = 0;
  ssize_t off = 0;
  bool pinned = false;
  bool mapped = false;
  ncclResult_t res = ncclSuccess;
  size_t mapSize = sizeof(T)*nelem;

  // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
  ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
  // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too, so over-allocate
  // by one page minus one byte and round the start address up.
  NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
  alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
  align = alignedAddr - (uint64_t)devMem;

  //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
  NCCLCHECKGOTO(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh), res, fail);
  pinned = true;

  NCCLCHECKGOTO(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize), res, fail);
  mapped = true;
  //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);

  NCCLCHECKGOTO(wrap_gdr_get_info(ncclGdrCopy, mh, &info), res, fail);

  // Will offset ever be non zero ?
  off = info.va - alignedAddr;

  NCCLCHECKGOTO(ncclCalloc(&md, 1), res, fail);
  md->gdrDevMem = devMem;
  md->gdrMap = gdrMap;
  md->gdrMapSize = mapSize;
  md->gdrOffset = off+align;
  md->gdrMh = mh;
  *gdrHandle = md;

  *ptr = (T *)((char *)gdrMap+off);
  if (devPtr) *devPtr = (T *)(devMem+off+align);

  TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
      md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);

  return ncclSuccess;

fail:
  // Best-effort unwind in reverse order of acquisition; the original error in
  // `res` is what the caller sees.
  if (mapped) (void) wrap_gdr_unmap(ncclGdrCopy, mh, gdrMap, mapSize);
  if (pinned) (void) wrap_gdr_unpin_buffer(ncclGdrCopy, mh);
  (void) hipFree(devMem);
  return res;
}
// Copies nelem elements of T from src to dst through the GDRCopy mapping
// described by gdrHandle (a gdr_mem_desc_t produced by ncclGdrCudaCalloc).
// Returns ncclSuccess, or the error propagated by NCCLCHECK.
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
  gdr_mem_desc_t* const desc = static_cast<gdr_mem_desc_t*>(gdrHandle);
  const size_t nbytes = nelem*sizeof(T);
  NCCLCHECK(wrap_gdr_copy_to_mapping(desc->gdrMh, dst, src, nbytes));
  return ncclSuccess;
}
// Releases everything recorded in the descriptor behind gdrHandle: the
// GDRCopy mapping, the pin, the device allocation, and finally the descriptor
// itself. Stops at, and returns, the first failure reported by
// NCCLCHECK/CUDACHECK.
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
  gdr_mem_desc_t* const desc = static_cast<gdr_mem_desc_t*>(gdrHandle);

  // Undo in reverse order of acquisition: unmap, unpin, then free device memory
  NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, desc->gdrMh, desc->gdrMap, desc->gdrMapSize));
  NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, desc->gdrMh));
  CUDACHECK(hipFree(desc->gdrDevMem));

  free(desc);
  return ncclSuccess;
}
#endif
#endif // End include guard
+5 -6
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,7 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
@@ -93,13 +94,11 @@ struct ncclTopoRanks {
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, int nc);
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
#include "info.h"
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+2 -2
Просмотреть файл
@@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -1089,7 +1089,7 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
if (ret != IBV_SUCCESS) {
WARN("ibv_post_send() failed with error %s", strerror(ret));
WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr);
return ncclSystemError;
}
return ncclSuccess;
+5 -4
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,8 +19,7 @@ typedef enum {
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown,
ncclPatternCollTreeUp,
ncclPatternCollTreeDown
ncclPatternCollTreeUpDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
@@ -50,6 +49,8 @@ struct ncclInfo {
int nchunksPerLoop;
ssize_t sendbytes;
ssize_t recvbytes;
int recvChunkSize;
int sendChunkSize;
uint32_t delta;
int channelId;
};
+10 -5
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -53,11 +53,12 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
NCCLCHECK(ncclNetAccept(lComm, &rComm));
CUDACHECK(hipExtMallocWithFlags(&gpuPtr, GPU_BUF_SIZE, hipDeviceMallocFinegrained));
ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
@@ -66,9 +67,13 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
}
ncclDebugNoWarn = 0;
CUDACHECK(hipFree(gpuPtr));
cleanup4:
NCCLCHECK(ncclNetCloseRecv(rComm));
cleanup3:
NCCLCHECK(ncclNetCloseSend(sComm));
cleanup2:
NCCLCHECK(ncclNetCloseListen(lComm));
cleanup1:
break;
}
return ncclSuccess;
+1
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+56 -30
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,48 +15,67 @@ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
struct ncclProxyArgs {
proxyProgressFunc_t progress;
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
struct ncclProxySubArgs {
struct ncclChannel* channel;
struct ncclConnector* connector;
size_t sendbytes;
size_t recvbytes;
int sliceSteps;
int chunkSteps;
int nsteps;
uint64_t opCount;
int protocol;
int segment; // Only for profiling
ncclDataType_t dtype;
ncclRedOp_t redOp;
int state; // add component before this line -- it is left out during initialization
ssize_t sendbytes;
ssize_t recvbytes;
int sendChunkSize;
int recvChunkSize;
int delta;
// Internal state
uint64_t base;
uint64_t posted;
uint64_t received; // Only used by recv proxy to wait for flush.
uint64_t received;
uint64_t flushed;
uint64_t transmitted;
uint64_t done;
uint64_t end;
uint64_t hdp_flushed;
void* requests[NCCL_STEPS];
};
struct ncclProxyArgs {
proxyProgressFunc_t progress;
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
int nsubs;
int done;
int sliceSteps;
int chunkSteps;
int chunkSize;
uint64_t opCount;
uint64_t commOpCount;
int protocol;
ncclDataType_t dtype;
ncclRedOp_t redOp;
ncclPattern_t pattern;
int root;
int state;
char* sharedBuff[NCCL_STEPS];
int sharedSize[NCCL_STEPS];
int idle;
uint64_t hdp_flushed;
// Element linking
pthread_mutex_t mutex;
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
struct ncclProxyArgs* nextGroup;
struct ncclProxyArgs** proxyAppendPtr;
};
struct ncclProxySharedBuffers {
int nslots;
int slotSize;
char* cudaBuff[2*MAXCHANNELS];
int* cudaUsed[2*MAXCHANNELS];
char* hostBuff[2*MAXCHANNELS];
int* hostUsed[2*MAXCHANNELS];
int size;
char* cudaBuff;
char* hostBuff;
struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
// Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
void* collNetResources;
};
struct ncclProxyPool;
@@ -64,11 +84,16 @@ struct ncclProxyState {
pthread_mutex_t opsMutex;
pthread_mutex_t poolMutex;
bool stop;
struct ncclProxySharedBuffers* sharedBuffs;
struct ncclProxyArgs* ops;
struct ncclProxyArgs* nextOps;
struct ncclProxySharedBuffers sharedBuffs;
struct ncclProxyArgs* ops; // Running operations, used by proxy thread
struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex
struct ncclProxyArgs* postedOpsEnd;
struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled)
struct ncclProxyArgs* nextOpsEnd;
struct ncclProxyArgs* pool;
struct ncclProxyArgs* pool; // Free operations for main thread
struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread
struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex
struct ncclProxyPool* pools;
};
@@ -80,15 +105,16 @@ enum proxyMode {
proxyTo = 2
};
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
#include <unistd.h>
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+7 -4
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -41,7 +41,7 @@ struct ncclConnect {
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -54,7 +54,10 @@ struct ncclTransport {
struct ncclTransportComm recv;
};
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex);
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
#endif
-2
Просмотреть файл
@@ -37,6 +37,4 @@ static long log2i(long n) {
return l;
}
int busIdToCudaDev(int64_t busId);
#endif
+110 -178
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -8,6 +8,7 @@
#include "nccl.h"
#include "channel.h"
#include "nvmlwrap.h"
#include "gdrwrap.h"
#include "bootstrap.h"
#include "transport.h"
#include "group.h"
@@ -123,15 +124,31 @@ ncclResult_t initNet() {
return ncclSuccess;
}
// GDRCOPY support: Off by default
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
// GDRCOPY support
gdr_t ncclGdrCopy = NULL;
ncclResult_t initGdrCopy() {
if (ncclParamGdrCopyEnable() == 1) {
ncclGdrCopy = ncclGdrInit();
}
return ncclSuccess;
}
NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static size_t maxLocalSizeBytes = 0;
static ncclResult_t ncclInit() {
if (initialized) return ncclSuccess;
pthread_mutex_lock(&initLock);
if (!initialized) {
initEnv();
initGdrCopy();
maxLocalSizeBytes = ncclKernMaxLocalSize();
NCCLCHECK(initNet());
INFO(NCCL_INIT, "Using network %s", ncclNetName());
initialized = true;
@@ -339,10 +356,15 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm->doneEvent != NULL)
CUDACHECK(hipEventDestroy(comm->doneEvent));
if (comm->intDoneEvent != NULL)
CUDACHECK(hipEventDestroy(comm->intDoneEvent));
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(hipStreamDestroy(comm->groupStream));
}
ncclDestroyQueueInfo(comm->enqueueInfo);
// Last rank frees shared resources between threads
int isLast;
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
@@ -380,6 +402,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
// the device we're on (failure cause #1) , better know it early.
hipEvent_t doneEvent;
CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
hipEvent_t intDoneEvent;
CUDACHECK(hipEventCreateWithFlags(&intDoneEvent, hipEventDisableTiming));
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
@@ -391,6 +415,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
comm->doneEvent = doneEvent;
comm->intDoneEvent = intDoneEvent;
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
comm->groupCudaStream = ncclParamGroupCudaStream();
@@ -429,6 +454,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->asyncOpCount = 0;
comm->asyncTotalSize = 0;
NCCLCHECK(ncclCalloc(&comm->enqueueInfo, 1));
comm->enqueueInfo->comm = comm;
comm->lastSetupNode = NULL;
comm->lastCudaGraphId = -1;
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
@@ -567,11 +597,11 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
int cgMdLaunch = 1;
// Set CG Mode
comm->launchMode = ncclComm::GROUP;
comm->launchMode = ncclComm::PARALLEL;
char* str = getenv("NCCL_LAUNCH_MODE");
if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str);
if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
comm->launchMode = ncclComm::PARALLEL;
if (str && strcmp(str, "GROUP") == 0) {
comm->launchMode = ncclComm::GROUP;
}
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
@@ -619,128 +649,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
return ncclSuccess;
}
extern struct ncclTransport collNetTransport;
// All ranks must participate in collNetSetup call
// type: 0 for send, 1 for recv
// return: 0 - unsupported, 1 - supported
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
int rankInCollNet = -1;
int supported = 0;
int isMaster = (rank == masterRank) ? 1 : 0;
struct {
int collNetRank;
ncclConnect connect;
} sendrecvExchange;
// check if we can connect to collnet, whose root is the nranks-th rank
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
peerInfo->rank = nranks;
int ret = 1;
if (isMaster) {
NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo));
}
// send master receives connect info from peer recv master
if (isMaster && type == 0) {
NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
rankInCollNet = sendrecvExchange.collNetRank;
INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
// select
struct ncclPeer* root = channel->peers+nranks;
struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send;
struct ncclTransportComm* transportComm = (type == 1) ? &(collNetTransport.recv) : &(collNetTransport.send);
conn->transportComm = transportComm;
// setup
struct ncclConnect myConnect;
if (isMaster && ret > 0) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
}
// prepare connect handles
ncclResult_t res;
struct {
int isMaster;
ncclConnect connect;
} *allConnects = NULL;
ncclConnect *masterConnects = NULL;
NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
if (type == 1) { // recv side: AllGather
// all ranks must participate
NCCLCHECK(ncclCalloc(&allConnects, nranks));
allConnects[rank].isMaster = isMaster;
memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
// consolidate
int c = 0;
for (int r = 0; r < nranks; r++) {
if (allConnects[r].isMaster) {
memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
if (r == rank) rankInCollNet = c;
c++;
}
}
} else { // send side : copy in connect info received from peer recv master
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
}
// connect
if (isMaster && ret > 0) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
struct ncclPeer* devRoot = channel->devPeers+nranks;
struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
if (isMaster && type == 1) {
sendrecvExchange.collNetRank = rankInCollNet;
memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
if (ret > 0) {
supported = 1;
}
cleanup:
if (allConnects != NULL) free(allConnects);
if (masterConnects != NULL) free(masterConnects);
return supported;
}
static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) {
int nranks = comm->nRanks;
// AllGather collNet setup results
int* allGatherFailures;
NCCLCHECK(ncclCalloc(&allGatherFailures, nranks));
allGatherFailures[rank] = collNetSetupFail;
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int)));
for (int i=0; i<nranks; i++) {
if (allGatherFailures[i] != 0) {
collNetSetupFail = 1;
break;
}
}
free(allGatherFailures);
if (collNetSetupFail) {
if (rank == 0) WARN("Cannot initialize CollNet, using %s instead", ncclNetName());
// Free collNet resources
for (int r=0; r<comm->collNetnChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
struct ncclPeer* peer = channel->peers+nranks;
if (peer->send.transportResources && peer->send.transportComm) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
if (peer->recv.transportResources && peer->recv.transportComm) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
peer->send.transportResources = NULL; // avoid double free
peer->recv.transportResources = NULL; // avoid double free
}
// Set support to 0
comm->collNetSupport = 0;
} else {
comm->collNetSupport = 1;
}
return ncclSuccess;
}
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
@@ -857,7 +765,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
collNetGraph.collNet = 1;
collNetGraph.crossNic = ncclParamCrossNic();
collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
collNetGraph.minChannels = 1;
collNetGraph.maxChannels = ringGraph.nChannels;
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
@@ -910,6 +819,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
}
// Determine CollNet support
if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
if (intraRanks > 8) {
if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
comm->collNetSupport = 0;
}
if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
if (rcclParamP2pNetDisable() == 0) {
STORE(comm->p2pNet, 1);
@@ -921,6 +837,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// AllGather3 - begin
struct ncclGraphInfo {
int pattern;
int nChannels;
int sameChannels;
float speedIntra;
float speedInter;
@@ -929,9 +846,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
};
struct {
int cudaCompCap;
int fullCudaCompCap;
int nChannels;
int collNetSupport;
int nc;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
@@ -942,39 +857,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
int idx;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
std::min(treeGraph.nChannels, ringGraph.nChannels);
allGather3Data[rank].nc = comm->nChannels*2;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4);
allGather3Data[rank].nc = 2;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
allGather3Data[rank].nc = comm->nChannels*4;
if (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count && (comm->topo->type & RCCL_TOPO_4P2H_ROME))
allGather3Data[rank].nc = (comm->topo->nodes[NET].count > 3 ? 2 : 4)*comm->topo->nodes[NET].count;
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
allGather3Data[rank].nc = comm->nChannels*6;
allGather3Data[rank].nc = 6;
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
allGather3Data[rank].ring.pattern = ringGraph.pattern;
allGather3Data[rank].ring.nChannels = ringGraph.nChannels;
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels;
allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
allGather3Data[rank].collNetSupport = comm->collNetSupport;
// CollNet channels are already duplicated
comm->collNetnChannels = 2*collNetGraph.nChannels;
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
@@ -1005,24 +918,30 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
allTopoRanks[i] = &allGather3Data[i].topoRanks;
nc = std::min(allGather3Data[i].nc, nc);
// Make sure we align all ranks so that the tuning is consistent across ranks
treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
}
comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
(comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
if (comm->nChannels < nChannelsOrig) {
// We started duplicating channels during Preset(), so we need to move the
// duplicated channels since we have removed some.
@@ -1031,15 +950,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, nc));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {
NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
} else {
comm->collNetnChannels = 0;
}
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc));
free(allTopoRanks);
free(nodesTreePatterns);
@@ -1076,46 +987,58 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all rings");
// Connect Trees
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all trees");
// Check if we can setup CollNet
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {
for (int c=comm->nChannels; c<comm->collNetnChannels; c++)
NCCLCHECK(initChannel(comm, c));;
int logicChannels = comm->collNetnChannels/2;
if (comm->collNetSupport > 0) {
int collNetSetupFail = 0;
const int recvIndex = 0; // recv GPU index is always 0
const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
for (int c=0; c<logicChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
struct ncclChannel* channelSend = comm->channels+c;
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down));
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up));
const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
collNetSetupFail = 1;
else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
collNetSetupFail = 1;
// Find all head ranks
int nHeads = collNetGraph.nChannels;
int *heads;
NCCLCHECK(ncclCalloc(&heads, nHeads));
// Head GPU index is always 0
for (int c=0; c<nHeads; c++) {
heads[c] = collNetGraph.intra[c*comm->localRanks+0];
}
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
for (int h=0; h<nHeads; h++) {
const int head = heads[h];
if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv) != 1)
collNetSetupFail = 1;
else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1)
collNetSetupFail = 1;
}
}
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph));
// Verify CollNet setup across ranks
NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
if (comm->collNetSupport) {
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+c;
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0));
}
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0));
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelSend = comm->channels+c;
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1));
}
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1));
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
}
}
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
free(rings);
@@ -1140,10 +1063,18 @@ affinity_restore:
return ncclSuccess;
}
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
ncclResult_t res;
CUDACHECK(hipSetDevice(cudaDev));
// Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
//if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
// TRACE(NCCL_INIT, "Setting hipLimitStackSize to %zi", maxLocalSizeBytes);
// CUDACHECKIGNORE(hipDeviceSetLimit(hipLimitStackSize, maxLocalSizeBytes));
//}
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
@@ -1184,6 +1115,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
} else {
NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end);
}
end:
if (ncclAsyncMode()) return ncclAsyncErrCheck(res);
else return res;
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+246
Просмотреть файл
@@ -0,0 +1,246 @@
/*************************************************************************
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "gdrwrap.h"
#ifndef GDR_DIRECT
#include "core.h"
#include <sched.h>
// Load state of the GDRCopy wrapper. It starts as gdrUninitialized; wrap_gdr_symbols()
// moves it to gdrInitializing while loading and then to gdrInitialized or gdrError.
static enum { gdrUninitialized, gdrInitializing, gdrInitialized, gdrError } gdrState = gdrUninitialized;
/* Function pointers to the libgdrapi entry points. They are filled in by
 * wrap_gdr_symbols() (dlopen() + dlsym()) and reset to NULL if loading fails;
 * every wrap_gdr_*() wrapper checks its pointer for NULL before calling it. */
static gdr_t (*gdr_internal_open)(void);
static int (*gdr_internal_close)(gdr_t g);
static int (*gdr_internal_pin_buffer)(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
static int (*gdr_internal_unpin_buffer)(gdr_t g, gdr_mh_t handle);
static int (*gdr_internal_get_info)(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
static int (*gdr_internal_map)(gdr_t g, gdr_mh_t handle, void **va, size_t size);
static int (*gdr_internal_unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size);
static void (*gdr_internal_runtime_get_version)(int *major, int *minor);
static void (*gdr_internal_driver_get_version)(gdr_t g, int *major, int *minor);
static int (*gdr_internal_copy_to_mapping)(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
static int (*gdr_internal_copy_from_mapping)(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
// Used to make the GDR library calls thread safe
// NOTE(review): presumably taken by the GDRLOCKCALL macro (defined outside this file) - confirm in gdrwrap.h.
pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER;
// Shared object name passed to dlopen() by wrap_gdr_symbols().
#define GDRAPI_LIBNAME "libgdrapi.so"
// Resolve a mandatory symbol with dlsym() and store it in funcptr.
// Requires `void* tmp` and `void** cast` locals plus a `teardown` label in the
// calling function; on failure it logs a warning and jumps to `teardown`.
#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlsym(handle, symbol); \
if (tmp == NULL) { \
WARN("dlsym failed on %s - %s", symbol, dlerror());\
goto teardown; \
} \
*cast = tmp; \
} while (0)
// Same as LOAD_SYM but for optional symbols: a missing symbol is only logged
// at INFO level and funcptr is left set to NULL (the result of dlsym()).
// Requires the same `tmp` and `cast` locals in the calling function.
#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\
cast = (void**)&funcptr; \
tmp = dlsym(handle, symbol); \
if (tmp == NULL) { \
INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
} \
*cast = tmp; \
} while (0)
/*
 * Load GDRAPI_LIBNAME with dlopen() and resolve every gdr_* entry point used
 * by the wrap_gdr_*() wrappers in this file.
 *
 * The outcome is cached in gdrState, so repeated calls return immediately.
 * The first caller switches gdrState from gdrUninitialized to gdrInitializing
 * and performs the load; callers that lose that race spin, yielding the CPU,
 * until gdrState leaves gdrInitializing and then report the cached outcome.
 *
 * Returns ncclSuccess when all symbols were resolved. Returns ncclSystemError
 * when dlopen() or any dlsym() failed; in that case every function pointer is
 * reset to NULL and the library handle (if any) is closed.
 */
ncclResult_t wrap_gdr_symbols(void) {
  if (gdrState == gdrInitialized)
    return ncclSuccess;
  if (gdrState == gdrError)
    return ncclSystemError;

  if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) {
    // Another thread raced in front of us. Wait for it to be done.
    // sched_yield() replaces pthread_yield(), which is non-standard and
    // deprecated in recent glibc releases.
    while (gdrState == gdrInitializing) sched_yield();
    return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError;
  }

  static void* gdrhandle = NULL;
  void* tmp;    // scratch variable required by LOAD_SYM
  void** cast;  // scratch variable required by LOAD_SYM

  gdrhandle=dlopen(GDRAPI_LIBNAME, RTLD_NOW);
  if (!gdrhandle) {
    WARN("Failed to open %s", GDRAPI_LIBNAME);
    goto teardown;
  }

  /* Load the function pointers from the DL library image */
  LOAD_SYM(gdrhandle, "gdr_open", gdr_internal_open);
  LOAD_SYM(gdrhandle, "gdr_close", gdr_internal_close);
  LOAD_SYM(gdrhandle, "gdr_pin_buffer", gdr_internal_pin_buffer);
  LOAD_SYM(gdrhandle, "gdr_unpin_buffer", gdr_internal_unpin_buffer);
  LOAD_SYM(gdrhandle, "gdr_get_info", gdr_internal_get_info);
  LOAD_SYM(gdrhandle, "gdr_map", gdr_internal_map);
  LOAD_SYM(gdrhandle, "gdr_unmap", gdr_internal_unmap);
  LOAD_SYM(gdrhandle, "gdr_runtime_get_version", gdr_internal_runtime_get_version);
  LOAD_SYM(gdrhandle, "gdr_driver_get_version", gdr_internal_driver_get_version);
  LOAD_SYM(gdrhandle, "gdr_copy_to_mapping", gdr_internal_copy_to_mapping);
  LOAD_SYM(gdrhandle, "gdr_copy_from_mapping", gdr_internal_copy_from_mapping);

  // Publish success last so that waiting threads only leave their spin loop
  // once every pointer above has been assigned.
  gdrState = gdrInitialized;
  return ncclSuccess;

teardown:
  // Drop any partially loaded symbols so the wrappers report "not initialized".
  gdr_internal_open = NULL;
  gdr_internal_close = NULL;
  gdr_internal_pin_buffer = NULL;
  gdr_internal_unpin_buffer = NULL;
  gdr_internal_get_info = NULL;
  gdr_internal_map = NULL;
  gdr_internal_unmap = NULL;
  gdr_internal_runtime_get_version = NULL;
  gdr_internal_driver_get_version = NULL;
  gdr_internal_copy_to_mapping = NULL;
  gdr_internal_copy_from_mapping = NULL;

  if (gdrhandle != NULL) {
    dlclose(gdrhandle);
    gdrhandle = NULL;  // do not keep a dangling handle in the static
  }
  gdrState = gdrError;
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_open().
// Returns its result, or NULL (after logging a warning) when the symbol has
// not been loaded.
gdr_t wrap_gdr_open(void) {
  if (gdr_internal_open != NULL) return gdr_internal_open();

  WARN("GDRCOPY lib wrapper not initialized.");
  return NULL;
}
// Forwards to the dynamically loaded gdr_close().
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_close(gdr_t g) {
  if (!gdr_internal_close) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  const int status = gdr_internal_close(g);
  if (status == 0) return ncclSuccess;

  WARN("gdr_close() failed: %d", status);
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_pin_buffer() through GDRLOCKCALL.
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
  if (!gdr_internal_pin_buffer) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  int status;
  GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), status);
  if (status == 0) return ncclSuccess;

  WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, status);
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_unpin_buffer() through GDRLOCKCALL.
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
  if (!gdr_internal_unpin_buffer) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  int status;
  GDRLOCKCALL(gdr_internal_unpin_buffer(g, handle), status);
  if (status == 0) return ncclSuccess;

  WARN("gdr_unpin_buffer(handle %lx) failed: %d", handle.h, status);
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_get_info() through GDRLOCKCALL.
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
  if (!gdr_internal_get_info) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  int status;
  GDRLOCKCALL(gdr_internal_get_info(g, handle, info), status);
  if (status == 0) return ncclSuccess;

  WARN("gdr_get_info(handle %lx) failed: %d", handle.h, status);
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_map() through GDRLOCKCALL.
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
  if (!gdr_internal_map) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  int status;
  GDRLOCKCALL(gdr_internal_map(g, handle, va, size), status);
  if (status == 0) return ncclSuccess;

  WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, status);
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_unmap() through GDRLOCKCALL.
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
  if (!gdr_internal_unmap) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  int status;
  GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), status);
  if (status == 0) return ncclSuccess;

  WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, status);
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_runtime_get_version(), which writes
// its result through `major` and `minor`.
// Returns ncclInternalError when the symbol has not been loaded, ncclSuccess
// otherwise (the underlying call has no status to report).
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) {
  if (gdr_internal_runtime_get_version != NULL) {
    gdr_internal_runtime_get_version(major, minor);
    return ncclSuccess;
  }

  WARN("GDRCOPY lib wrapper not initialized.");
  return ncclInternalError;
}
// Forwards to the dynamically loaded gdr_driver_get_version(), which writes
// its result through `major` and `minor`.
// Returns ncclInternalError when the symbol has not been loaded, ncclSuccess
// otherwise (the underlying call has no status to report).
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
  if (gdr_internal_driver_get_version != NULL) {
    gdr_internal_driver_get_version(g, major, minor);
    return ncclSuccess;
  }

  WARN("GDRCOPY lib wrapper not initialized.");
  return ncclInternalError;
}
// Forwards to the dynamically loaded gdr_copy_to_mapping() through GDRLOCKCALL.
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
  if (!gdr_internal_copy_to_mapping) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  int status;
  GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), status);
  if (status == 0) return ncclSuccess;

  WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, status);
  return ncclSystemError;
}
// Forwards to the dynamically loaded gdr_copy_from_mapping() through GDRLOCKCALL.
// Returns ncclInternalError when the symbol has not been loaded,
// ncclSystemError when the library returns a non-zero status, and
// ncclSuccess otherwise.
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
  if (!gdr_internal_copy_from_mapping) {
    WARN("GDRCOPY lib wrapper not initialized.");
    return ncclInternalError;
  }

  int status;
  GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), status);
  if (status == 0) return ncclSuccess;

  WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, status);
  return ncclSystemError;
}
#endif /* !GDR_DIRECT */
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+3 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -17,7 +17,7 @@
#define NCCL_SUFFIX "${NCCL_SUFFIX}"
#define NCCL_VERSION_CODE ${NCCL_VERSION}
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
/* Encode a version triple as a single comparable integer.
 * Versions before 2.9 use the legacy X*1000 + Y*100 + Z encoding; 2.9 and
 * later (including every major version above 2, whatever its minor) use
 * X*10000 + Y*100 + Z, so that e.g. 3.0.0 compares greater than 2.9.6. */
#define NCCL_VERSION(X,Y,Z) (((X) > 2 || ((X) == 2 && (Y) >= 9)) ? (X) * 10000 + (Y) * 100 + (Z) : (X) * 1000 + (Y) * 100 + (Z))
#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
+266 -257
Просмотреть файл
@@ -1,15 +1,15 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
#include "info.h"
#include "graph.h"
#include "collectives.h"
enum { proxyRecv=0, proxySend=1, p2pProxyRecv=2, p2pProxySend=3 };
enum { proxyRecv=0, proxySend=1 };
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -34,26 +34,32 @@ struct ncclProxyPool {
static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
struct ncclProxyState* state = &comm->proxyState;
struct ncclProxyArgs* elem;
pthread_mutex_lock(&state->poolMutex);
if (state->pool == NULL) {
// Allocate a new pool of elements
struct ncclProxyPool* newPool;
NCCLCHECK(ncclCalloc(&newPool, 1));
struct ncclProxyArgs* newElems = newPool->elems;
// Chain newly allocated elements
for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
// Check whether there are freed elements
if (state->poolReturned) {
pthread_mutex_lock(&state->poolMutex);
state->pool = state->poolReturned;
state->poolReturned = NULL;
pthread_mutex_unlock(&state->poolMutex);
} else {
// Allocate a new pool of elements
struct ncclProxyPool* newPool;
NCCLCHECK(ncclCalloc(&newPool, 1));
struct ncclProxyArgs* newElems = newPool->elems;
// Chain newly allocated elements
for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
}
// Add them all to the pool list
state->pool = newElems;
// Save the pool memory block for later resource release
newPool->next = state->pools;
state->pools = newPool;
}
// Add them all to the pool list
state->pool = newElems;
// Save the pool memory block for later resource release
newPool->next = state->pools;
state->pools = newPool;
}
elem = state->pool;
state->pool = state->pool->next;
pthread_mutex_unlock(&state->poolMutex);
elem->next = elem->nextPeer = elem->nextGroup = NULL;
elem->next = elem->nextPeer = NULL;
*argsptr = elem;
return ncclSuccess;
}
@@ -75,23 +81,18 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
WARN("Active list loop at element %ld", OP_INDEX(op));
}
op->idle |= OP_SEEN;
printf("[%ld]", OP_INDEX(op));
printf("[%ld(%ld/%d)]", OP_INDEX(op), op->opCount, op->nsubs);
if (op->nextPeer) {
printf("(%ld)", OP_INDEX(op->nextPeer));
struct ncclProxyArgs* n = op->nextPeer;
n->idle |= OP_SEEN;
while (n->nextGroup || n->nextPeer) {
n = n->nextGroup ? n->nextGroup : n->nextPeer;
while (n->nextPeer) {
n = n->nextPeer;
n->idle |= OP_SEEN;
}
}
if (op->nextGroup) {
printf("--G->");
op = op->nextGroup;
} else {
printf("--N->");
op = op->next;
}
printf("->");
op = op->next;
}
printf("[X]\n");
@@ -128,46 +129,62 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
return ncclSuccess;
}
static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args, int shared) {
static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args) {
struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr;
int shared = args->subs[0].connector->conn.shared;
if (proxyAppend) {
if (shared && proxyAppend->opCount == args->opCount) {
if ((proxyAppend->sliceSteps != args->sliceSteps) ||
(proxyAppend->chunkSteps != args->chunkSteps) ||
(proxyAppend->protocol != args->protocol) ||
(proxyAppend->dtype != args->dtype) ||
(proxyAppend->redOp != args->redOp)) {
WARN("Proxy append mismatch");
return ncclInternalError;
}
if (proxyAppend->nsubs >= NCCL_PROXY_MAX_SUBS) {
WARN("Proxy append out of bound");
return ncclInternalError;
}
memcpy(proxyAppend->subs+proxyAppend->nsubs, args->subs, sizeof(struct ncclProxySubArgs));
proxyAppend->nsubs++;
args->next = proxyAppend->next;
proxyAppend->next = NULL;
proxyAppend->nextGroup = args;
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group, prevGroup %5ld, next %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend), OP_INDEX(args->next));
// Free args as we merged them
args->next = state->poolFreed;
state->poolFreed = args;
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group with %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
} else {
proxyAppend->nextPeer = args;
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
*(args->proxyAppendPtr) = args;
}
} else {
// Nothing running for that peer. Add to the list
if (state->ops == NULL) {
// Create the list
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element : \n", OP_INDEX(args), shared, args->opCount);
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount);
state->ops = args;
} else {
// Append element at the end of the list
struct ncclProxyArgs* last = state->ops;
while (last->nextGroup || last->next) last = last->nextGroup ? last->nextGroup : last->next;
while (last->next) last = last->next;
last->next = args;
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element : \n", OP_INDEX(args),shared, args->opCount);
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args),shared, args->opCount);
}
*(args->proxyAppendPtr) = args;
}
*(args->proxyAppendPtr) = args;
return ncclSuccess;
}
static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args, int connIndex) {
if (peer < 0) return ncclSuccess;
struct ncclPeer* peerComm = args->channel->peers+peer;
struct ncclConnector* connector = type < p2pProxyRecv ? (type == proxyRecv ? &peerComm->recv : &peerComm->send)
: (type == p2pProxyRecv ? &peerComm->p2pRecv : &peerComm->p2pSend);
struct ncclChannel* channel = args->subs[0].channel;
struct ncclPeer* peerComm = channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
if (connector->transportComm == NULL) {
WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
type < p2pProxyRecv ? (type == proxyRecv ? "recv" : "send") : (type == p2pProxyRecv ? "p2pRecv" : "p2pSend"),
peer, args->channel->id);
WARN("Rank %d has no transport for %s peer %d on channel %d", connector->comm->rank,
type == proxyRecv ? "recv" : "send", peer, channel->id);
return ncclInternalError;
}
if (connector->transportComm->proxy == NULL) return ncclSuccess;
@@ -176,14 +193,10 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
struct ncclProxyArgs* op;
NCCLCHECK(allocateArgs(connector->comm, &op));
memcpy(op, args, sizeof(struct ncclProxyArgs));
op->connector = connector;
op->subs[0].connector = connector;
op->progress = connector->transportComm->proxy;
op->state = ncclProxyOpReady;
op->proxyAppendPtr =
connector->conn.shared ?
state->sharedBuffs->proxyAppend+2*args->channel->id+type : // Shared buffers
&connector->proxyAppend; // Dedicated buffers
op->proxyAppendPtr = connector->proxyAppendPtr;
if (state->nextOps == NULL) state->nextOps = op;
else state->nextOpsEnd->next = op;
@@ -191,120 +204,131 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
return ncclSuccess;
}
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks) {
struct ncclChannel* channel = args->subs[0].channel;
int pattern = args->pattern;
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
struct ncclRing* ring = &args->channel->ring;
if (NeedProxy(proxyRecv, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args));
if (NeedProxy(proxySend, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args));
struct ncclRing* ring = &channel->ring;
if (NeedProxy(proxyRecv, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args, 0));
if (NeedProxy(proxySend, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args, 0));
}
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
// Tree up
struct ncclTree* tree = &args->channel->tree;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args));
NCCLCHECK(SaveProxy(proxySend, tree->up, args));
struct ncclTree* tree = &channel->tree;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args, 0));
NCCLCHECK(SaveProxy(proxySend, tree->up, args, 0));
}
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
// Tree down
struct ncclTree* tree = &args->channel->tree;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args));
NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
struct ncclTree* tree = &channel->tree;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args, 0));
NCCLCHECK(SaveProxy(proxyRecv, tree->up, args, 0));
}
if (pattern == ncclPatternCollTreeUp) {
if (pattern == ncclPatternCollTreeUpDown) {
// CollTree up
struct ncclTree* tree = &args->channel->collTree;
NCCLCHECK(SaveProxy(proxyRecv, tree->down[0], args));
NCCLCHECK(SaveProxy(proxySend, tree->up, args));
}
if (pattern == ncclPatternCollTreeDown) {
NCCLCHECK(SaveProxy(proxySend, channel->collTree.out, args, 1)); // For CollTree up, we are using push
// CollTree down
struct ncclTree* tree = &args->channel->collTree;
NCCLCHECK(SaveProxy(proxySend, tree->down[0], args));
NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
NCCLCHECK(SaveProxy(proxyRecv, channel->collTree.out, args, 0));
}
return ncclSuccess;
}
// NOTE(review): this span interleaves two revisions. The function opened on the
// next line (ncclProxySaveP2p taking info/channel/segment) is never closed before
// ncclProxyComputeP2p begins, and `args` is used both as an object (`args.nsteps`)
// and as a pointer (`args->nsubs`). Confirm which lines are live before editing.
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment) {
struct ncclProxyArgs args;
memset(&args, 0, sizeof(struct ncclProxyArgs));
args.channel = channel;
args.sliceSteps = 1;
args.chunkSteps = 1;
args.protocol = NCCL_PROTO_SIMPLE;
args.segment = segment;
args.opCount = channel->workFifoTail-1;
args.dtype = info->datatype;
// Fills `args` (zeroed first) with the send/recv description for a p2p operation
// taken from `info`, using a single sub-operation (nsubs = 1), and stores the
// chosen chunk sizes back into info->recvChunkSize / info->sendChunkSize.
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args) {
memset(args, 0, sizeof(struct ncclProxyArgs));
int channelId = info->channelId;
args->nsubs = 1;
struct ncclProxySubArgs* sub = args->subs;
struct ncclChannel* channel = info->comm->channels+channelId;
sub->channel = channel;
args->sliceSteps = 1;
args->chunkSteps = 1;
args->protocol = NCCL_PROTO_SIMPLE;
args->dtype = info->datatype;
sub->delta = info->delta;
sub->recvbytes = info->recvbytes;
sub->sendbytes = info->sendbytes;
// Default chunk size: the simple-protocol buffer divided by NCCL_STEPS and SENDRECV_SLICEFACTOR.
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
info->recvChunkSize = stepSize;
info->sendChunkSize = stepSize;
if (info->delta > 0 && info->recvbytes >= 0) {
// Receive peer is `delta` ranks behind this rank, wrapping around nRanks.
int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
if (args.nsteps == 0) args.nsteps = 1;
args.recvbytes = info->recvbytes;
args.sendbytes = 0;
NCCLCHECK(SaveProxy(LOAD(info->comm->p2pNet) ? p2pProxyRecv : proxyRecv, peerrecv, &args));
// Only shrink the chunk when the receive connector's transport has a proxy.
if (channel->peers[peerrecv].recv[0].transportComm && channel->peers[peerrecv].recv[0].transportComm->proxy) {
// Tune chunk size for the network
if (info->recvbytes < stepSize) info->recvChunkSize /= 4;
else if (info->recvbytes < 8*stepSize) info->recvChunkSize /= 2;
}
sub->recvChunkSize = info->recvChunkSize;
}
if (info->delta > 0 && info->sendbytes >= 0) {
// Send peer is `delta` ranks ahead of this rank, wrapping around nRanks.
int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
if (args.nsteps == 0) args.nsteps = 1;
args.sendbytes = info->sendbytes;
args.recvbytes = 0;
NCCLCHECK(SaveProxy(LOAD(info->comm->p2pNet) ? p2pProxySend : proxySend, peersend, &args));
// Only shrink the chunk when the send connector's transport has a proxy.
if (channel->peers[peersend].send[0].transportComm && channel->peers[peersend].send[0].transportComm->proxy) {
// Tune chunk size for the network
if (info->sendbytes < stepSize) info->sendChunkSize /= 4;
else if (info->sendbytes < 8*stepSize) info->sendChunkSize /= 2;
}
sub->sendChunkSize = info->sendChunkSize;
}
return ncclSuccess;
}
static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr, struct ncclProxyArgs** prevGroupPtr) {
// Registers the receive and/or send half of a point-to-point operation with the
// proxy. `args->subs[0]` must already carry delta, byte counts and chunk sizes.
// The byte counts of the sub-op are temporarily overwritten so each SaveProxy
// call sees only one direction, then put back before a successful return.
// Returns ncclSuccess, or the first error reported by SaveProxy.
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args) {
  struct ncclProxySubArgs* sub = args->subs;
  args->opCount = sub->channel->workFifoTail-1;
  args->commOpCount = comm->opCount;

  // Remember what the caller asked for; both fields are clobbered below.
  const ssize_t savedRecvBytes = sub->recvbytes;
  const ssize_t savedSendBytes = sub->sendbytes;

  // Receive half: the peer sits `delta` ranks behind us (modulo nRanks).
  if (sub->delta > 0 && savedRecvBytes >= ssize_t(0)) {
    const int recvPeer = (comm->nRanks+comm->rank-sub->delta)%comm->nRanks;
    sub->sendbytes = 0;
    sub->recvbytes = savedRecvBytes;
    sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize);
    if (sub->nsteps == 0) {
      // A zero-byte transfer still takes one step.
      sub->nsteps = 1;
    }
    NCCLCHECK(SaveProxy(proxyRecv, recvPeer, args, NCCL_CONN_IDX_P2P));
  }

  // Send half: the peer sits `delta` ranks ahead of us (modulo nRanks).
  if (sub->delta > 0 && savedSendBytes >= ssize_t(0)) {
    const int sendPeer = (comm->rank+sub->delta)%comm->nRanks;
    sub->recvbytes = 0;
    sub->sendbytes = savedSendBytes;
    sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize);
    if (sub->nsteps == 0) {
      sub->nsteps = 1;
    }
    NCCLCHECK(SaveProxy(proxySend, sendPeer, args, NCCL_CONN_IDX_P2P));
  }

  // Reset proxy args for potentially multiple cuda graph launches.
  // It is safe as long as SaveProxy copies contents of args to op.
  sub->sendbytes = savedSendBytes;
  sub->recvbytes = savedRecvBytes;
  return ncclSuccess;
}
static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
struct ncclProxyArgs* freeOp = *opPtr;
DEBUG_PROXY_PRINT("Remove %ld/%ld -> %ld -> %ld/%ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(*prevGroupPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next), OP_INDEX(freeOp->nextGroup));
if (*prevGroupPtr && *prevOpPtr) return ncclInternalError;
if (freeOp->nextGroup) {
// Part of a group : remove the element
struct ncclProxyArgs* next = freeOp->nextGroup;
*opPtr = next;
if (*prevGroupPtr) {
(*prevGroupPtr)->nextGroup = next;
} else if (*prevOpPtr) {
DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next));
struct ncclProxyArgs* next = freeOp->next;
*opPtr = next;
if (freeOp->nextPeer) {
// replace op by nextPeer
struct ncclProxyArgs* nextPeer = freeOp->nextPeer;
if (*prevOpPtr) {
(*prevOpPtr)->next = nextPeer;
} else {
state->ops = nextPeer;
}
nextPeer->next = next;
*(prevOpPtr) = nextPeer;
} else {
*(freeOp->proxyAppendPtr) = NULL;
if (*prevOpPtr) {
(*prevOpPtr)->next = next;
} else {
state->ops = next;
}
} else {
struct ncclProxyArgs* next = freeOp->next;
*opPtr = next;
if ((*prevGroupPtr)) {
(*prevGroupPtr)->next = next;
(*prevGroupPtr)->nextGroup = NULL;
(*prevGroupPtr)->nextPeer = freeOp->nextPeer;
if (*(freeOp->proxyAppendPtr) == freeOp) *(freeOp->proxyAppendPtr) = *prevGroupPtr;
(*prevOpPtr) = *prevGroupPtr;
(*prevGroupPtr) = NULL;
} else {
if (freeOp->nextPeer) {
// replace op by nextPeer
struct ncclProxyArgs* nextPeer = freeOp->nextPeer;
if (*prevOpPtr) {
(*prevOpPtr)->next = nextPeer;
} else {
state->ops = nextPeer;
}
struct ncclProxyArgs* lastGroup = nextPeer;
while (lastGroup->nextGroup) lastGroup = lastGroup->nextGroup;
lastGroup->next = next;
*(prevOpPtr) = lastGroup;
} else {
*(freeOp->proxyAppendPtr) = NULL;
if (*prevOpPtr) {
(*prevOpPtr)->next = next;
} else {
state->ops = next;
}
}
}
}
pthread_mutex_lock(&state->poolMutex);
freeOp->next = state->pool;
state->pool = freeOp;
pthread_mutex_unlock(&state->poolMutex);
freeOp->next = state->poolFreed;
state->poolFreed = freeOp;
DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
NCCLCHECK(dumpProxyState(state));
return ncclSuccess;
@@ -312,33 +336,81 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs*
// Walks the proxy op list starting at *opsPtr, calls each op's progress()
// callback, clears *idle when an op reports it was not idle, and removes ops
// whose state has become ncclProxyOpNone.
// NOTE(review): this span interleaves two revisions: op->progress() is invoked
// both inside and outside the lastOpCount guard, removeOp() is called with both
// 4 and 3 arguments, and the list is advanced twice. Confirm which lines are live.
static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
struct ncclProxyArgs* prevOp = NULL;
struct ncclProxyArgs* prevGroup = NULL;
struct ncclProxyArgs* op = *opsPtr;
while (op) {
// An op in the list must never already be in the "none" state.
if (op->state == ncclProxyOpNone) return ncclInternalError;
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
// yet and might be cancelled before they even start. Hold on on those.
if (op->opCount < comm->lastOpCount) {
NCCLCHECK(op->progress(op));
*idle &= op->idle;
}
NCCLCHECK(op->progress(op));
*idle &= op->idle;
// progress() marks a finished op by resetting its state to ncclProxyOpNone.
if (op->state == ncclProxyOpNone) {
NCCLCHECK(removeOp(state, &op, &prevOp, &prevGroup));
NCCLCHECK(removeOp(state, &op, &prevOp));
} else {
if (op->nextGroup) {
prevGroup = op;
prevOp = NULL;
op = op->nextGroup;
} else {
prevOp = op;
prevGroup = NULL;
op = op->next;
}
prevOp = op;
op = op->next;
}
}
return ncclSuccess;
}
ncclResult_t ncclProxyAppendPosted(struct ncclProxyState* state) {
// Return any freed element first
if (state->poolFreed) {
struct ncclProxyArgs* end = state->poolFreed;
while (end->next) end = end->next;
pthread_mutex_lock(&state->poolMutex);
end->next = state->poolReturned;
state->poolReturned = state->poolFreed;
pthread_mutex_unlock(&state->poolMutex);
state->poolFreed = NULL;
}
// Then wait until we have new work to do
pthread_mutex_lock(&state->opsMutex);
while (state->postedOps == NULL) {
if (state->stop) return ncclSuccess;
pthread_cond_wait(&state->cond, &state->opsMutex);
}
// Sort operations as we append them : collectives and
// receives first, then sends.
struct ncclProxyArgs* next, *prev = NULL, *op = state->postedOps;
int commOpCount = op->commOpCount;
while (op && op->commOpCount == commOpCount) {
next = op->next;
if (op->subs[0].sendbytes) {
if (prev) prev->next = next;
else state->postedOps = next;
op->next = NULL;
NCCLCHECK(ProxyAppend(state, op));
} else prev = op;
op = next;
}
op = state->postedOps;
while (op && op->commOpCount == commOpCount) {
next = op->next;
op->next = NULL;
NCCLCHECK(ProxyAppend(state, op));
op = next;
}
state->postedOps = op;
if (op == NULL) state->postedOpsEnd = NULL;
NCCLCHECK(dumpProxyState(state));
pthread_mutex_unlock(&state->opsMutex);
if (state->poolFreed) {
struct ncclProxyArgs* end = state->poolFreed;
while (end->next) end = end->next;
pthread_mutex_lock(&state->poolMutex);
end->next = state->poolReturned;
state->poolReturned = state->poolFreed;
pthread_mutex_unlock(&state->poolMutex);
state->poolFreed = NULL;
}
return ncclSuccess;
}
void* persistentThread(void *comm_) {
struct ncclComm* comm = (struct ncclComm*)comm_;
struct ncclProxyState* state = &comm->proxyState;
@@ -346,158 +418,95 @@ void* persistentThread(void *comm_) {
sprintf(threadName, "NCCLproxy %5d", comm->rank);
nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
pthread_mutex_lock(&state->opsMutex);
struct ncclProxyArgs** opsPtr = &state->ops;
while (1) {
if (LOAD(comm->abortFlag)) {
pthread_mutex_unlock(&state->opsMutex);
return NULL;
}
while (LOAD(opsPtr) == NULL) {
if (state->stop) {
// No more commands to process and proxy has been requested to stop
pthread_mutex_unlock(&state->opsMutex);
return NULL;
}
pthread_cond_wait(&state->cond, &state->opsMutex);
ncclResult_t ret = ncclProxyAppendPosted(state);
if (ret != ncclSuccess) {
comm->fatalError = ret;
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
return NULL;
}
}
int idle = 1;
ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
if (ret != ncclSuccess) {
comm->fatalError = ret;
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
pthread_mutex_unlock(&state->opsMutex);
return NULL;
}
if (idle) {
pthread_mutex_unlock(&state->opsMutex);
sched_yield(); // No request progressed. Let others run.
pthread_mutex_lock(&state->opsMutex);
}
}
}
// Hands the operations queued on state->nextOps over to the proxy thread and
// wakes it with a condition-variable signal, all under opsMutex.
// NOTE(review): this span interleaves two revisions: one sorts and appends ops
// directly with a 3-argument ProxyAppend, the other splices nextOps onto
// postedOps; pthread_cond_signal also appears both guarded and unguarded.
// Confirm which lines are live before editing.
ncclResult_t ncclProxyStart(struct ncclComm* comm) {
struct ncclProxyState* state = &comm->proxyState;
// Nothing queued: return without taking the lock.
if (state->nextOps == NULL) return ncclSuccess;
pthread_mutex_lock(&state->opsMutex);
// Sort operations as we append them : collectives and
// receives first, then sends.
ncclProxyArgs* next, *prev = NULL, *op = state->nextOps;
while (op) {
next = op->next;
if (op->sendbytes) {
if (prev) prev->next = next;
else state->nextOps = next;
op->next = NULL;
NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
} else prev = op;
op = next;
}
op = state->nextOps;
while (op) {
next = op->next;
op->next = NULL;
NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
op = next;
}
// Splice the nextOps list onto the tail of postedOps, then empty nextOps.
if (state->postedOps) state->postedOpsEnd->next = state->nextOps;
else state->postedOps = state->nextOps;
state->postedOpsEnd = state->nextOpsEnd;
state->nextOps = state->nextOpsEnd = NULL;
NCCLCHECK(dumpProxyState(state));
if (state->ops != NULL)
pthread_cond_signal(&state->cond);
pthread_cond_signal(&state->cond);
pthread_mutex_unlock(&state->opsMutex);
comm->opCount++;
return ncclSuccess;
}
NCCL_PARAM(ProxySharedBuffersCount, "SHARED_BUFF_COUNT", -2);
// Lazily allocates the shared proxy buffer (device memory when `cuda` is
// non-zero, pinned host memory otherwise) and reports its size and base address
// through *size and *ptr.
// NOTE(review): this span interleaves two revisions: `state` is declared twice
// (once as a heap-allocated pointer, once as the address of an embedded struct),
// and cudaBuff/hostBuff are used both as arrays of per-pool pointers and as
// single pointers. Confirm which lines are live before editing.
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
if (state == NULL) {
NCCLCHECK(ncclCalloc(&state, 1));
comm->proxyState.sharedBuffs = state;
state->nslots = ncclParamProxySharedBuffersCount();
// -2 is the default declared by NCCL_PARAM for SHARED_BUFF_COUNT.
if (state->nslots == -2) {
state->nslots = NCCL_STEPS*NCCL_MAX_WORK_ELEMENTS;
}
state->slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
// size == 0 means the buffer size has not been computed yet.
if (state->size == 0) {
// Round nChannels up to the next power of two.
int p2pnChannels = 1;
while (p2pnChannels < comm->nChannels) p2pnChannels *= 2;
int p2pSize = 2*p2pnChannels*NCCL_MAX_WORK_ELEMENTS*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
int collNetSize = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
// One allocation is sized for whichever of the two layouts is larger.
state->size = std::max(p2pSize, collNetSize);
}
char* buff;
int* used;
*size = 2*std::max(comm->nChannels, comm->p2pnChannels)*state->slotSize*state->nslots;
*size = state->size;
if (cuda && state->cudaBuff[0] == NULL) {
NCCLCHECK(ncclCudaCalloc(&buff, *size, cuda));
NCCLCHECK(ncclCalloc(&used, 2*std::max(comm->nChannels, comm->p2pnChannels)*state->nslots));
for (int i=0; i<2*std::max(comm->nChannels, comm->p2pnChannels); i++) {
state->cudaBuff[i] = buff + state->nslots*state->slotSize*i;
state->cudaUsed[i] = used + state->nslots*i;
}
} else if (state->hostBuff[0] == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&buff, *size));
NCCLCHECK(ncclCalloc(&used, 2*std::max(comm->nChannels, comm->p2pnChannels)*state->nslots));
for (int i=0; i<2*std::max(comm->nChannels, comm->p2pnChannels); i++) {
state->hostBuff[i] = buff + state->nslots*state->slotSize*i;
state->hostUsed[i] = used + state->nslots*i;
}
if (cuda && state->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, cuda));
} else if (state->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
}
buff = cuda ? state->cudaBuff[0] : state->hostBuff[0];
*ptr = buff;
*ptr = cuda ? state->cudaBuff : state->hostBuff;
return ncclSuccess;
}
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr) {
struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
// Use different pools for different channels and also separate send/recv.
int p = 2*channel+type;
int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
if (buff == NULL) return ncclInternalError;
int nslots = 1;
while (nslots*state->slotSize < size) nslots *= 2;
for (int s=0; s<state->nslots; s+=nslots) {
int u = 0;
for (int i=0; i<nslots; i++) u += used[s+i];
if (u == 0) {
for (int i=0; i<nslots; i++) used[s+i] = 1;
*ptr = buff+state->slotSize*s;
return ncclSuccess;
}
}
*ptr = NULL;
// Computes the address of one point-to-point slot inside the shared proxy buffer.
//   cuda    - non-zero selects the device buffer, zero the host buffer
//   type    - outermost index (separates the send and recv pools)
//   channel - channel index within [0, comm->p2pnChannels)
//   slot    - step index within [0, NCCL_STEPS)
//   index   - innermost index within [0, NCCL_MAX_WORK_ELEMENTS)
//   ptr     - receives the slot address
// Always returns ncclSuccess; no bounds or NULL checks are performed.
ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr) {
  struct ncclProxySharedBuffers* shared = &comm->proxyState.sharedBuffs;
  char* base = cuda ? shared->cudaBuff : shared->hostBuff;
  int bytesPerSlot = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);

  // Row-major flattening of [type][channel][slot][index].
  int flat = type*comm->p2pnChannels+channel;
  flat = flat*NCCL_STEPS+slot;
  flat = flat*NCCL_MAX_WORK_ELEMENTS+index;

  *ptr = base + bytesPerSlot * flat;
  return ncclSuccess;
}
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr) {
struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
int p = 2*channel+type;
int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
if (buff == NULL) return ncclInternalError;
int nslots = 1;
while (nslots*state->slotSize < size) nslots *= 2;
int s = (ptr-buff)/state->slotSize;
if (s < 0 || s+nslots > state->nslots) {
WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots);
return ncclInternalError;
}
for (int i=0; i<nslots; i++) used[s+i] = 0;
// Computes the address of one CollNet slot inside the shared proxy buffer.
//   cuda    - non-zero selects the device buffer, zero the host buffer
//   type    - outermost index
//   slot    - step index within [0, NCCL_STEPS)
//   channel - innermost index within [0, comm->nChannels)
//   ptr     - receives the slot address
// Always returns ncclSuccess; no bounds or NULL checks are performed.
ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr) {
  struct ncclProxySharedBuffers* shared = &comm->proxyState.sharedBuffs;
  char* base = cuda ? shared->cudaBuff : shared->hostBuff;
  int bytesPerSlot = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;

  // Row-major flattening of [type][slot][channel].
  int flat = type*NCCL_STEPS+slot;
  flat = flat*comm->nChannels+channel;

  *ptr = base + bytesPerSlot * flat;
  return ncclSuccess;
}
// Releases the device and pinned-host shared proxy buffers of `comm`.
// NOTE(review): this span interleaves two revisions: `state` is declared twice
// (as a pointer member and as the address of an embedded struct), and the
// buffers are freed both as `cudaBuff[0]`/`hostBuff[0]` and as
// `cudaBuff`/`hostBuff`. Confirm which lines are live before editing.
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
if (state) {
CUDACHECK(hipFree(state->cudaBuff[0]));
free(state->cudaUsed[0]);
NCCLCHECK(ncclCudaHostFree(state->hostBuff[0]));
free(state->hostUsed[0]);
free(state);
}
struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
CUDACHECK(hipFree(state->cudaBuff));
NCCLCHECK(ncclCudaHostFree(state->hostBuff));
return ncclSuccess;
}
+159 -71
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -31,33 +31,23 @@ static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, str
}
// Picks the first usable transport with index >= n, skipping TRANSPORT_SHM,
// and runs its setup on `connector`. The template argument selects which side
// of the transport is used: type == 1 picks `send`, anything else `recv`.
// Returns ncclSuccess once a transport is set up, the callee's error if
// canConnect/setup fails, or ncclInternalError when no transport can connect.
template <int type>
static ncclResult_t selectTransportN(struct ncclComm* comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId, int n) {
  int t = n;
  while (t < NTRANSPORTS) {
    if (t != TRANSPORT_SHM) {
      struct ncclTransport* candidate = &ncclTransports[t];
      int reachable = 0;
      // No topology graph is supplied for this query.
      NCCLCHECK(candidate->canConnect(&reachable, comm->topo, NULL, myInfo, peerInfo));
      if (reachable) {
        struct ncclTransportComm* side = (type == 1) ? &candidate->send : &candidate->recv;
        connector->transportComm = side;
        NCCLCHECK(side->setup(comm, NULL, myInfo, peerInfo, connect, connector, channelId));
        return ncclSuccess;
      }
    }
    t++;
  }
  WARN("No transport found !");
  return ncclInternalError;
}
template <int type>
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) {
struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
comm->channels[channelId].peers[peer].recv + connIndex;
int xgmi;
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, myInfo, peerInfo));
for (int t=0; t<NTRANSPORTS; t++) {
if (connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P)))
continue;
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
int ret = 0;
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
if (ret) {
connector->transportComm = transportComm;
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId));
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
return ncclSuccess;
}
}
@@ -65,17 +55,17 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
return ncclInternalError;
}
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
uint32_t mask = 1 << channel->id;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
comm->connectRecv[peer] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
comm->connectSend[peer] |= mask;
}
return ncclSuccess;
@@ -90,10 +80,14 @@ void dumpData(struct ncclConnect* data, int ndata) {
}
}
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
hipStream_t transportSetupStream;
CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
struct ncclConnect data[2*MAXCHANNELS];
uint32_t p2pNet = LOAD(comm->p2pNet);
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer];
@@ -103,79 +97,173 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
int sendChannels = 0, recvChannels = 0;
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
int xgmi = 0;
if (p2pNet && graph == NULL) {
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].p2pRecv;
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer));
if (xgmi) {
NCCLCHECK(selectTransportN<0>(comm, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c, TRANSPORT_P2P));
}
else {
NCCLCHECK(selectTransportN<0>(comm, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c, TRANSPORT_NET));
}
}
else {
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
}
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex));
}
}
struct ncclConnect* sendData = recvData+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
int xgmi = 0;
if (p2pNet && graph == NULL) {
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].p2pSend;
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer));
if (xgmi) {
NCCLCHECK(selectTransportN<1>(comm, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c, TRANSPORT_P2P));
}
else {
NCCLCHECK(selectTransportN<1>(comm, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c, TRANSPORT_NET));
}
}
else {
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
}
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex));
}
}
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
sendData = data;
recvData = data+sendChannels;
}
} else {
if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
}
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
struct ncclConnector* conn = (p2pNet && graph == NULL) ? &comm->channels[c].peers[sendPeer].p2pSend
: &comm->channels[c].peers[sendPeer].send;
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
conn->connected = 1;
if (p2pNet && graph == NULL) CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].p2pSend, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
else CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
}
}
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
struct ncclConnector* conn = (p2pNet && graph == NULL) ? &comm->channels[c].peers[recvPeer].p2pRecv
: &comm->channels[c].peers[recvPeer].recv;
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
conn->connected = 1;
if (p2pNet && graph == NULL) CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].p2pRecv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
else CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
}
}
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
}
CUDACHECK(hipStreamSynchronize(transportSetupStream));
CUDACHECK(hipStreamDestroy(transportSetupStream));
return ncclSuccess;
}
extern struct ncclTransport collNetTransport;
// All ranks must participate in collNetSetup call
// return: 0 - unsupported, 1 - supported
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type) {
int rank = comm->rank;
int nranks = comm->nRanks;
int nMasters = comm->nNodes;
int rankInCollNet = -1;
int supported = 0;
int isMaster = (rank == masterRank) ? 1 : 0;
struct {
int collNetRank;
ncclConnect connect;
} sendrecvExchange;
// check if we can connect to collnet, whose root is the nranks-th rank
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
peerInfo->rank = nranks;
int ret = 1;
if (isMaster) {
NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo));
}
// send master receives connect info from peer recv master
if (isMaster && type == collNetSend) {
NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)));
rankInCollNet = sendrecvExchange.collNetRank;
TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
// select
struct ncclPeer* root = channel->peers+nranks;
// connector index: 0 for recv, 1 for send
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
conn->transportComm = transportComm;
// setup
struct ncclConnect myConnect;
if (isMaster && ret > 0) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
ncclResult_t res;
struct {
int isMaster;
ncclConnect connect;
} *allConnects = NULL;
ncclConnect *masterConnects = NULL;
NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
if (type == collNetRecv) { // recv side: AllGather
// all ranks must participate
NCCLCHECK(ncclCalloc(&allConnects, nranks));
allConnects[rank].isMaster = isMaster;
memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
// consolidate
int c = 0;
for (int r = 0; r < nranks; r++) {
if (allConnects[r].isMaster) {
memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
if (r == rank) rankInCollNet = c;
c++;
}
}
} else { // send side : copy in connect info received from peer recv master
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
}
// connect
if (isMaster && ret > 0) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
struct ncclPeer* devRoot = channel->devPeers+nranks;
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
if (isMaster && type == collNetRecv) {
sendrecvExchange.collNetRank = rankInCollNet;
memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
if (ret > 0) {
supported = 1;
}
cleanup:
if (allConnects != NULL) free(allConnects);
if (masterConnects != NULL) free(masterConnects);
return supported;
}
// Agrees across all ranks on whether CollNet setup succeeded.
//
// Each rank contributes `collNetSetupFail`; if any rank reports a non-zero
// value, every rank frees the CollNet transport resources held on the extra
// peer at index nRanks of each channel and clears comm->collNetSupport.
//
// Returns ncclSuccess, or the error from the allocation, the all-gather, or a
// transport free() callback.
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
  int rank = comm->rank;
  int nranks = comm->nRanks;
  // AllGather collNet setup results
  int* allGatherFailures;
  NCCLCHECK(ncclCalloc(&allGatherFailures, nranks));
  allGatherFailures[rank] = collNetSetupFail;
  ncclResult_t gatherResult = bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int));
  if (gatherResult != ncclSuccess) {
    // Release the scratch array before propagating; it used to leak here.
    free(allGatherFailures);
    NCCLCHECK(gatherResult);
  }
  for (int i=0; i<nranks; i++) {
    if (allGatherFailures[i] != 0) {
      collNetSetupFail = 1;
      break;
    }
  }
  free(allGatherFailures);
  if (collNetSetupFail) {
    if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
    // Free collNet resources
    // NOTE(review): only connector index 0 of send/recv is released here, while
    // ncclTransportCollNetSetup selects connectors with `+type`; confirm that no
    // other connector index holds resources.
    for (int r=0; r<comm->nChannels; r++) {
      struct ncclChannel* channel = comm->channels+r;
      struct ncclPeer* peer = channel->peers+nranks;
      if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources));
      if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources));
      peer->send->transportResources = NULL; // avoid double free
      peer->recv->transportResources = NULL; // avoid double free
    }
    // Set support to 0
    comm->collNetSupport = 0;
  }
  return ncclSuccess;
}
+337 -209
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,17 @@
#include "comm.h"
#include "coll_net.h"
#include "graph.h"
#include <assert.h>
#include <hsa/hsa_ext_amd.h>
#define COLLNET_GROUP_NSUBS 8
#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
// Connect information published by the receive side: just the CollNet handle.
struct collNetRecvConnectInfo {
collNetHandle_t collNetHandle;
};
// Connect information for the send side.
// NOTE(review): `reqFifo` is declared twice below with different types
// (struct reqSlot* and void*), and collNetHandle sits next to collNetComm /
// mhandles; this looks like two interleaved revisions. Confirm which members
// are live.
struct collNetSendConnectInfo {
collNetHandle_t collNetHandle;
void* collNetComm;
// One memory handle per protocol.
void* mhandles[NCCL_NUM_PROTOCOLS];
struct reqSlot* reqFifo;
void* reqFifo;
};
struct reqSlot {
@@ -28,10 +27,10 @@ struct reqSlot {
};
struct collNetSendResources {
void* collNetSendComm;
struct ncclComm* comm;
void* collNetComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
uint32_t* llData;
int netDev;
int useGdr;
void* sendMhandles[NCCL_NUM_PROTOCOLS];
@@ -39,47 +38,68 @@ struct collNetSendResources {
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
struct reqSlot* reqFifo;
struct reqSlot (*reqFifo)[NCCL_STEPS];
int collNetRank;
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
// Per-connector state kept by the CollNet receive transport.
// NOTE(review): this span interleaves two revisions: `reqFifo` is declared
// twice (as a pointer and as a [COLLNET_MAX_GROUPS][NCCL_STEPS] array), and
// netListenComm/collNetRecvComm appear alongside comm/collNetComm. Confirm
// which members are live.
struct collNetRecvResources {
void* netListenComm;
void* collNetRecvComm;
struct ncclComm* comm;
void* collNetComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
uint32_t* llData;
int netDev;
int useGdr;
// One memory handle per protocol.
void* mhandles[NCCL_NUM_PROTOCOLS];
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
struct reqSlot* reqFifo;
struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
int collNetRank;
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
// CollNet state shared by all connectors of one communicator; it is stored in
// comm->proxyState.sharedBuffs.collNetResources.
// NOTE: every array below is indexed by network device id (netDev), although
// it is sized by MAXCHANNELS.
struct collNetSharedResources {
// Listen comms opened by collNetSharedListen() and closed by
// collNetSharedConnect() once the collective comm is established.
void* collNetListenComms[MAXCHANNELS];
// Collective comms, created on the first collNetSharedConnect() for a device.
void* collNetComms[MAXCHANNELS];
// Number of references handed out for each entry of collNetComms.
int collNetCommRefCount[MAXCHANNELS];
};
/* Determine if we can communicate with the peer.
 * This implementation does not inspect topo, graph, info1 or info2: it always
 * stores 1 in *ret and returns ncclSuccess. */
ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 1;
return ncclSuccess;
}
/* Open the CollNet listen endpoint for a network device, unless a collective
 * comm already exists for that device.
 *
 * comm          : communicator owning the shared CollNet resource table. The
 *                 table is allocated on first use and stored in
 *                 comm->proxyState.sharedBuffs.collNetResources.
 * netDev        : network device id; used as the index into the table.
 * collNetHandle : out, handle filled in by collNetListen(). It is left
 *                 untouched when a collective comm already exists for netDev.
 *
 * Returns ncclSuccess, ncclInternalError when netDev does not fit in the
 * table, or the error propagated from ncclCalloc()/collNetListen().
 */
ncclResult_t collNetSharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
  // The table is sized by MAXCHANNELS but indexed by device id: reject ids
  // that would access memory past the end of its arrays.
  if (netDev < 0 || netDev >= MAXCHANNELS) {
    WARN("CollNet : network device %d is out of range (max %d)", netDev, MAXCHANNELS);
    return ncclInternalError;
  }

  struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
  if (resources == NULL) {
    // First CollNet user on this communicator: create the zero-initialized table.
    NCCLCHECK(ncclCalloc(&resources, 1));
    comm->proxyState.sharedBuffs.collNetResources = resources;
  }

  // Only listen while no collective comm has been established for this device.
  if (resources->collNetComms[netDev] == NULL) {
    NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
  }
  return ncclSuccess;
}
/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct collNetSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
send->conn.shared = 1;
resources->comm = comm;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
send->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev+1;
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
int recvSize = offsetof(struct ncclRecvMem, buff);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p];
// Simple uses shared buffers and we don't support LL128
recvSize += send->comm->buffSizes[NCCL_PROTO_LL];
if (resources->useGdr) {
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr));
@@ -87,38 +107,64 @@ ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
send->conn.curr_hdp_reg = resources->curr_hdp_reg;
}
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2));
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
/* Setup recv connector */
ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct collNetRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
recv->conn.shared = 1;
resources->comm = comm;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
recv->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev;
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
int recvSize = offsetof(struct ncclRecvMem, buff);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p];
// Simple uses shared buffers and we don't support LL128
recvSize += recv->comm->buffSizes[NCCL_PROTO_LL];
if (resources->useGdr) {
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr));
}
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2));
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(collNetListen(resources->netDev, &info->collNetHandle, &resources->netListenComm));
NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle));
return ncclSuccess;
}
/* Return the collective comm shared by all connectors using `netDev`,
 * establishing it on first use, and take one reference on it.
 *
 * comm         : communicator owning the shared CollNet resource table.
 * netDev       : network device id; used as the index into the table.
 * connectInfos : array of `nranks` entries, each reinterpreted as a
 *                struct collNetRecvConnectInfo carrying a rank's handle.
 * nranks, rank : size of the collective group and this process's rank in it.
 * collNetComm  : out, the shared collective comm for netDev.
 *
 * Returns ncclSuccess, ncclInternalError when the table is missing or netDev
 * is out of range, or the error propagated from the CollNet calls.
 */
ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
  struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
  // The table is created by collNetSharedListen(); connecting without it
  // would dereference a NULL pointer.
  if (resources == NULL) {
    WARN("CollNet : connect requested but shared resources were never allocated");
    return ncclInternalError;
  }
  // The table is sized by MAXCHANNELS but indexed by device id.
  if (netDev < 0 || netDev >= MAXCHANNELS) {
    WARN("CollNet : network device %d is out of range (max %d)", netDev, MAXCHANNELS);
    return ncclInternalError;
  }

  if (resources->collNetComms[netDev] == NULL) {
    // Connect to coll comm: gather a pointer to every rank's handle.
    collNetHandle_t** handlePtrs = NULL;
    NCCLCHECK(ncclCalloc(&handlePtrs, nranks));
    for (int i = 0; i < nranks; i++) {
      struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
      handlePtrs[i] = &(info->collNetHandle);
    }
    ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank,
                                      resources->collNetListenComms[netDev],
                                      resources->collNetComms+netDev);
    // Release the temporary array before checking the result so that it is
    // not leaked on failure.
    free(handlePtrs);
    NCCLCHECK(ret);

    // Close listen comm. Clear the table entry first so that no dangling
    // pointer to the closed listen comm remains, whatever the close returns.
    void* listenComm = resources->collNetListenComms[netDev];
    resources->collNetListenComms[netDev] = NULL;
    NCCLCHECK(collNetCloseListen(listenComm));
  }

  *collNetComm = resources->collNetComms[netDev];
  resources->collNetCommRefCount[netDev]++;
  return ncclSuccess;
}
@@ -128,33 +174,40 @@ ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* conne
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
// Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
offset += send->comm->buffSizes[p];
}
send->conn.buffs[NCCL_PROTO_LL] = resources->recvMem->buff;
send->conn.buffs[NCCL_PROTO_LL128] = send->conn.buffs[NCCL_PROTO_SIMPLE] = NULL;
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->recvMem->tail;
send->conn.sizesFifo = resources->recvMem->sizesFifo;
send->conn.ptrsFifo = resources->recvMem->ptrsFifo;
send->conn.head = &resources->sendMem->head;
resources->sendMem->head = -NCCL_STEPS; // Don't give any credit yet when sharing buffers
for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
// Get info from recv side
resources->collNetRank = rank;
resources->reqFifo = info->reqFifo;
resources->collNetSendComm = info->collNetComm;
resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
resources->recvMhandles[p] = info->mhandles[p];
// Register buffers
NCCLCHECK(collNetRegMr(resources->collNetSendComm, send->conn.buffs[NCCL_PROTO_SIMPLE], send->comm->buffSizes[NCCL_PROTO_SIMPLE],
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(collNetRegMr(resources->collNetSendComm, resources->llData, send->comm->buffSizes[NCCL_PROTO_LL]/2,
NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_LL]));
NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
int size;
char* ptr;
// Allocate & Register shared buffers for the Simple protocol
NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, &size, &ptr));
NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
// Allocate & Register shared buffers for the LL protocol
NCCLCHECK(ncclProxySharedBuffersInit(send->comm, 0, &size, &ptr));
NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_LL]));
return ncclSuccess;
}
@@ -175,52 +228,57 @@ ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* conne
// Head/Tail/Opcount are always on host
recv->conn.tail = &resources->recvMem->tail;
recv->conn.ptrsFifo = resources->recvMem->ptrsFifo;
recv->conn.head = &resources->sendMem->head;
// Connect to coll comm
collNetHandle_t** handlePtrs = NULL;
NCCLCHECK(ncclCalloc(&handlePtrs, nranks));
for (int i = 0; i < nranks; i++) {
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
handlePtrs[i] = &(info->collNetHandle);
}
ncclResult_t res;
NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, resources->netListenComm, &resources->collNetRecvComm), res, cleanup);
NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
// Register buffers
NCCLCHECK(collNetRegMr(resources->collNetRecvComm, recv->conn.buffs[NCCL_PROTO_SIMPLE], recv->comm->buffSizes[NCCL_PROTO_SIMPLE],
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE]));
NCCLCHECK(collNetRegMr(resources->collNetRecvComm, resources->llData, recv->comm->buffSizes[NCCL_PROTO_LL]/2,
NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_LL]));
int size;
char* ptr;
// Create shared info between send and recv proxies
NCCLCHECK(ncclCalloc(&(resources->reqFifo), NCCL_STEPS));
// Allocate & Register shared buffers for the Simple protocol
NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, &size, &ptr));
NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
// Allocate & Register shared buffers for the LL protocol
NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, 0, &size, &ptr));
NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_LL]));
// Pass info to send side
info->reqFifo = resources->reqFifo;
info->collNetComm = resources->collNetRecvComm;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
info->mhandles[p] = resources->mhandles[p];
cleanup:
if (handlePtrs != NULL) free(handlePtrs);
// Close listen comm
NCCLCHECK(collNetCloseListen(resources->netListenComm));
return ncclSuccess;
}
return res;
/* Drop one reference on the collective comm of `netDev`; close it when the
 * last reference goes away, and release the whole shared table once no device
 * holds a reference any more.
 *
 * comm   : communicator owning the shared CollNet resource table.
 * netDev : network device id; used as the index into the table.
 *
 * Returns ncclSuccess (also when there is nothing left to release),
 * ncclInternalError when netDev is out of range, or the error propagated from
 * collNetCloseColl().
 */
ncclResult_t collNetSharedFree(struct ncclComm* comm, int netDev) {
  struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
  // The table may never have been allocated, or may already have been released
  // by a previous call: there is nothing to do then.
  if (resources == NULL) return ncclSuccess;
  // The table is sized by MAXCHANNELS but indexed by device id.
  if (netDev < 0 || netDev >= MAXCHANNELS) {
    WARN("CollNet : network device %d is out of range (max %d)", netDev, MAXCHANNELS);
    return ncclInternalError;
  }

  // Only give back a reference that is actually outstanding, so that an
  // unbalanced free cannot drive the counter negative (which would prevent
  // both the close below and the final release of the table).
  if (resources->collNetCommRefCount[netDev] > 0) {
    resources->collNetCommRefCount[netDev]--;
    if (resources->collNetCommRefCount[netDev] == 0) {
      // Clear the entry before closing so no stale pointer stays in the table.
      void* collComm = resources->collNetComms[netDev];
      resources->collNetComms[netDev] = NULL;
      NCCLCHECK(collNetCloseColl(collComm));
    }
  }

  // Keep the table alive while any device still has references.
  for (int c=0; c<MAXCHANNELS; c++) {
    if (resources->collNetCommRefCount[c]) return ncclSuccess;
  }
  comm->proxyState.sharedBuffs.collNetResources = NULL;
  free(resources);
  return ncclSuccess;
}
ncclResult_t collNetSendFree(void* sendTransportResources) {
struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
if (resources->collNetSendComm) {
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL]));
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
if (resources->collNetComm) {
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_LL]));
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
}
if (resources->useGdr)
CUDACHECK(hipFree(resources->devRecvMem));
free(resources->llData);
if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem));
NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
free(resources);
return ncclSuccess;
}
@@ -228,115 +286,151 @@ ncclResult_t collNetSendFree(void* sendTransportResources) {
ncclResult_t collNetRecvFree(void* recvTransportResources) {
struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
if (resources->collNetRecvComm) {
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL]));
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
}
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
if (resources->useGdr)
CUDACHECK(hipFree(resources->devRecvMem));
free(resources->llData);
free(resources->reqFifo);
// Make sure SendFree is called before RecvFree
if (resources->collNetRecvComm) {
NCCLCHECK(collNetCloseColl(resources->collNetRecvComm));
if (resources->collNetComm) {
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_LL]));
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
}
if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem));
NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
free(resources);
return ncclSuccess;
}
// True when sub-operation index `s` is the last one of its CollNet group:
// either it sits in the final slot of a COLLNET_GROUP_NSUBS-sized group, or it
// is the last sub overall (i.e. it ends a trailing, partially filled group).
// Relies on a variable named `args` with an `nsubs` member being in scope
// where the macro is used.
// The parameter is parenthesized so that an expression argument such as
// LAST_OF_GROUP(a+b) is evaluated as a whole instead of being split by the
// higher precedence of `%`.
#define LAST_OF_GROUP(s) \
  ((s) % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || (s) == args->nsubs-1)
ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
if (args->protocol == NCCL_PROTO_LL128) {
WARN("CollNet does not support LL128");
return ncclInternalError;
}
struct collNetSendResources* resources = (struct collNetSendResources*) (args->connector->transportResources);
if (args->state == ncclProxyOpReady) {
// Round to next multiple of sliceSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->posted = args->transmitted = args->done = resources->step;
args->end = resources->step + args->nsteps;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->transmitted = sub->done = 0;
resources->step = sub->base + sub->nsteps;
}
args->state = ncclProxyOpProgress;
args->hdp_flushed = 0;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* sendMhandle = resources->sendMhandles[p];
void* recvMhandle = resources->recvMhandles[p];
struct reqSlot* reqFifo = resources->reqFifo;
int buffSlot = args->transmitted%NCCL_STEPS;
if (args->transmitted < args->end && args->transmitted < args->done + NCCL_STEPS
&& LOAD(&reqFifo[buffSlot].recvBuff) != NULL) {
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = LOAD(sizesFifo+buffSlot);
char* buff = localBuff+buffSlot*stepSize;
int ready = 1;
if (args->protocol == NCCL_PROTO_LL) {
uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
// Pack data into another buffer
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements
buff = (char*)sendBuff;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *d1 = &lines[i].data1;
volatile uint32_t *f2 = &lines[i].flag2;
volatile uint32_t *d2 = &lines[i].data2;
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
sendBuff[2*i] = LOAD(d1);
sendBuff[2*i+1] = LOAD(d2);
}
size = nFifoLines*2*sizeof(uint32_t);
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
int perGroupSteps = NCCL_STEPS / nGroups;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
void* sendMhandle = resources->sendMhandles[p];
void* recvMhandle = resources->recvMhandles[p];
int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
auto reqFifo = resources->reqFifo;
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
if (p == NCCL_PROTO_SIMPLE) {
char* ptr;
int sharedBuffSlot = sub->posted%NCCL_STEPS;
NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, resources->useGdr, 0, sharedBuffSlot, 0, &ptr));
resources->recvMem->ptrsFifo[buffSlot] = ptr + s*args->chunkSize;
__sync_synchronize();
}
if (ready) {
// flush HDP if not done
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
args->hdp_flushed = LOAD(recvTail);
STORE(resources->curr_hdp_reg, 1);
volatile uint64_t* sendHead = &resources->sendMem->head;
sub->posted += args->sliceSteps;
*sendHead = sub->base + sub->posted - NCCL_STEPS;
}
// Enforce sync between operations of the same group.
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received));
if (groupSync && sub->received < sub->posted && sub->received < sub->done + perGroupSteps) {
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
int sharedBuffSlot = sub->received%NCCL_STEPS;
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)) || p == NCCL_PROTO_LL)) {
// We have something to receive, let's check whether data is ready.
int size = sizesFifo[buffSlot];
int ready = 1;
if (s == 0) {
NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 0, sharedBuffSlot, 0, &args->sharedBuff[sharedBuffSlot]));
args->sharedSize[sharedBuffSlot] = p == NCCL_PROTO_SIMPLE ? args->chunkSize : size/2;
}
// Data is ready, try to send.
int count = size/ncclTypeSize(args->dtype);
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%lu/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->transmitted += args->sliceSteps;
if (p == NCCL_PROTO_LL) {
char* localBuff = sub->connector->conn.buffs[p];
uint32_t flag = NCCL_LL_FLAG(sub->base + sub->received + 1);
int nFifoLines = size / sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
// Pack data into the shared buffer
uint32_t* sendBuff = (uint32_t*)(args->sharedBuff[sharedBuffSlot]+args->sharedSize[sharedBuffSlot]*s);
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *d1 = &lines[i].data1;
volatile uint32_t *f2 = &lines[i].flag2;
volatile uint32_t *d2 = &lines[i].data2;
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
sendBuff[2*i] = d1[0];
sendBuff[2*i+1] = d2[0];
}
}
if (ready) {
sizesFifo[buffSlot] = -1;
sub->received += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
//continue;
// flush HDP if not done
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
args->hdp_flushed = LOAD(recvTail);
STORE(resources->curr_hdp_reg, 1);
}
}
}
}
}
// Check whether the network has completed some send operations.
if (args->done < args->transmitted) {
int done, size;
int buffSlot = args->done%NCCL_STEPS;
NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
STORE(&reqFifo[buffSlot].size, size);
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
// (reordered store after store is possible on POWER, though not on x86)
__sync_synchronize();
STORE(&reqFifo[buffSlot].recvBuff, NULL); // Notify recvProxy
args->done += args->sliceSteps;
resources->sendMem->head = args->done;
args->idle = 0;
if (args->done == args->end) {
resources->step = args->end;
args->state = ncclProxyOpNone;
if (LAST_OF_GROUP(s) && (sub->transmitted < sub->received)) {
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
if (reqFifo[group][buffSlot].recvBuff != NULL) {
int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
int count = totalSize / ncclTypeSize(args->dtype);
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] == NULL) continue;
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
sub->transmitted += args->sliceSteps;
args->idle = 0;
continue;
}
}
// Check whether the network has completed some send operations.
if (LAST_OF_GROUP(s) && sub->done < sub->transmitted) {
int done, size;
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
// (reordered store after store is possible on POWER, though not on x86)
__sync_synchronize();
reqFifo[group][buffSlot].recvBuff = NULL; // Notify recvProxy
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].done += args->sliceSteps;
args->idle = 0;
int allDone = 1;
for (int i=0; i<args->nsubs; i++) {
if (args->subs[i].done < args->subs[i].nsteps) { allDone = 0; break; }
}
if (allDone) {
args->state = ncclProxyOpNone;
TRACE(NCCL_NET, "sendProxy [%lu/%d] stopped", sub->done, s);
}
}
return ncclSuccess;
}
}
}
@@ -348,81 +442,115 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
WARN("CollNet does not support LL128");
return ncclInternalError;
}
struct collNetRecvResources* resources = (struct collNetRecvResources*) (args->connector->transportResources);
if (args->state == ncclProxyOpReady) {
// Round to next multiple of sliceSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->posted = args->received = args->transmitted = args->done = resources->step;
args->end = resources->step + args->nsteps;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
resources->step = sub->base + sub->nsteps;
}
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* mhandle = resources->mhandles[p];
struct reqSlot* reqFifo = resources->reqFifo;
if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
int buffSlot = args->posted%NCCL_STEPS;
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize);
TRACE(NCCL_NET, "recvProxy [%lu/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
args->posted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
}
if (args->posted > args->received) {
int buffSlot = args->received%NCCL_STEPS;
if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete
TRACE(NCCL_NET, "recvProxy [%lu/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
if (args->protocol == NCCL_PROTO_LL) { // ll
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
int perGroupSteps = NCCL_STEPS / nGroups;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
void* mhandle = resources->mhandles[p];
int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
auto reqFifo = resources->reqFifo;
// Enforce sync between operations of the same group.
if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) {
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
char* ptr;
int sharedBuffSlot = sub->posted%NCCL_STEPS;
NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, 0, &ptr));
args->sharedBuff[sharedBuffSlot] = ptr;
int slotSize = sub->connector->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
reqFifo[group][buffSlot].recvBuff = args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*slotSize;
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
sub->posted += args->sliceSteps;
args->idle = 0;
continue;
}
if (LAST_OF_GROUP(s) && (sub->posted > sub->received)) {
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
int sharedBuffSlot = sub->received%NCCL_STEPS;
if (reqFifo[group][buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
args->sharedSize[sharedBuffSlot] = reqFifo[group][buffSlot].size;
int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1);
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
sub->received += args->sliceSteps;
if (reqFifo[group][buffSlot].size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
int slotSize = sub->connector->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
char* recvAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*slotSize;
NCCLCHECK(collNetIflush(resources->collNetComm, recvAddress, totalSize, mhandle, sub->requests+buffSlot));
} else {
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
}
args->idle = 0;
continue;
}
}
if (LAST_OF_GROUP(s) && (sub->received > sub->flushed)) {
// Progress flush operations
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
int done = 1;
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
args->idle = 0;
//continue;
}
}
if (sub->flushed > sub->transmitted) {
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
int slotSize = sub->connector->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
char* ptr = args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*slotSize + (s%COLLNET_GROUP_NSUBS)*args->sharedSize[sharedBuffSlot];
if (p == NCCL_PROTO_SIMPLE) {
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
ptrsFifo[buffSlot] = ptr;
__sync_synchronize();
resources->recvMem->tail = sub->base + sub->flushed;
}
if (p == NCCL_PROTO_LL) { // ll
// re-attach flag
uint32_t flag = NCCL_LL_FLAG(args->received + 1);
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
char* localBuff = sub->connector->conn.buffs[p];
uint32_t flag = NCCL_LL_FLAG(sub->base + sub->transmitted + 1);
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
int nFifoLines = DIVUP(LOAD(&reqFifo[buffSlot].size), 2*sizeof(uint32_t));
uint32_t* recvData = (uint32_t*)ptr;
int nFifoLines = DIVUP(args->sharedSize[sharedBuffSlot], 2*sizeof(uint32_t));
for (int i=0; i<nFifoLines; i++) {
lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
}
}
args->received += args->sliceSteps;
if (LOAD(&reqFifo[buffSlot].size) > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
NCCLCHECK(collNetIflush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, LOAD(&reqFifo[buffSlot].size), mhandle, args->requests+buffSlot));
} else {
args->requests[buffSlot] = NULL;
}
sub->transmitted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
continue;
}
}
if (args->received > args->transmitted) {
// Progress flush operations
int buffSlot = args->transmitted%NCCL_STEPS;
int done = 1;
if (args->requests[buffSlot]) NCCLCHECK(collNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
args->transmitted += args->sliceSteps;
__sync_synchronize();
resources->recvMem->tail = args->transmitted;
args->idle = 0;
return ncclSuccess;
}
}
if (args->transmitted > args->done) {
// Enforce sync here to make sure the last sub doesn't increase "done" before all others in the group have
// reached the same point, otherwise we would start posting buffers to the send proxy before we're done
// processing all the shared buffer.
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->done == sub->done)) || (s && (sub-1)->done > sub->done));
volatile uint64_t* sendHead = &resources->sendMem->head;
uint64_t done = LOAD(sendHead);
while (done > args->done &&
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
args->transmitted > args->done) {
args->done += args->sliceSteps;
if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) {
sub->done += args->sliceSteps;
args->idle = 0;
if (args->done == args->end) {
resources->step = args->end;
if (sub->done == sub->nsteps && s == args->nsubs-1) {
args->state = ncclProxyOpNone;
TRACE(NCCL_NET, "recvProxy [%lu/%d] stopped", sub->done, s);
}
}
}
+301 -226
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,6 +11,7 @@
#include <sys/time.h>
#include "collectives.h"
#include <hsa/hsa_ext_amd.h>
#include "gdrwrap.h"
struct netConnectInfo {
ncclNetHandle_t netHandle;
@@ -41,6 +42,13 @@ struct netRecvResources {
void* netRecvComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
// GDRCOPY support
void* gdrMemDesc;
struct ncclRecvMem* devRecvMem;
void* gdrFlushDesc;
int* devFlushMem;
int netDev;
int useGdr;
int shared;
@@ -63,13 +71,16 @@ NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct netSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
send->proxyAppendPtr = send->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId+1 : &send->proxyAppend;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
// Send/Receive: Round-robin NICs based on the receiver's CUDA device
int nicRR = comm->peerInfo[peerInfo->rank].cudaDev;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
@@ -120,20 +131,45 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
return ncclSuccess;
}
ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
// GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory
NCCL_PARAM(GdrCopyTailEnable, "GDRCOPY_TAIL_ENABLE", 1);
// GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers
NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct netRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
recv->proxyAppendPtr = recv->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId : &recv->proxyAppend;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
// Send/Receive: Round-robin NICs based on the receiver's CUDA device
int nicRR = comm->cudaDev;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
// GDRCOPY tail support
if (ncclGdrCopy != NULL && ncclParamGdrCopyTailEnable() == 1) {
struct ncclRecvMem* devCudaPtr;
NCCLCHECK(ncclGdrCudaCalloc(&resources->devRecvMem, &devCudaPtr, 1, &resources->gdrMemDesc));
// The GDR mapped VA doesn't work on the SMs
recv->conn.tail = &((struct ncclRecvMem*)devCudaPtr)->tail;
} else {
recv->conn.tail = &resources->recvMem->tail;
}
// GDRCOPY flush support
#if defined (__x86_64__)
if (ncclGdrCopy != NULL && ncclParamGdrCopyFlushEnable() == 1) {
int* cudaPtr;
NCCLCHECK(ncclGdrCudaCalloc(&resources->devFlushMem, &cudaPtr, 1, &resources->gdrFlushDesc));
}
#endif
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
recv->conn.tail = &resources->recvMem->tail;
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
recv->conn.head = &resources->sendMem->head;
@@ -242,6 +278,14 @@ ncclResult_t netSendFree(void* transportResources) {
ncclResult_t netRecvFree(void* transportResources) {
struct netRecvResources* resources = (struct netRecvResources*)transportResources;
// GDRCOPY support
if (resources->gdrFlushDesc) {
NCCLCHECK(ncclGdrCudaFree(resources->gdrFlushDesc));
}
// GDRCOPY support
if (resources->gdrMemDesc) {
NCCLCHECK(ncclGdrCudaFree(resources->gdrMemDesc));
}
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
for (int l=0; l<LOC_COUNT; l++) {
@@ -260,254 +304,285 @@ ncclResult_t netRecvFree(void* transportResources) {
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
if (args->state == ncclProxyOpReady) {
// Round to next multiple of sliceSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->posted = args->transmitted = args->done = args->hdp_flushed = resources->step;
args->end = resources->step + args->nsteps;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
}
args->state = ncclProxyOpProgress;
args->hdp_flushed = 0;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* mhandle = *(resources->mhandlesProto[p]);
int buffSize = stepSize*args->sliceSteps;
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
if (args->sendbytes < buffSize) buffSize = args->sendbytes;
// Post buffers to the GPU
if (args->posted < args->end && args->posted < args->done + NCCL_STEPS) {
if (resources->shared) {
char* ptr;
NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, &ptr));
if (ptr == NULL) return ncclInternalError;
resources->recvMem->ptrsFifo[args->posted%NCCL_STEPS] = ptr;
__sync_synchronize();
volatile uint64_t* sendHead = &resources->sendMem->head;
args->posted += args->sliceSteps;
STORE(sendHead, args->posted - NCCL_STEPS);
} else args->posted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
}
// Check whether we received data from the GPU and send it to the network
int buffSlot = args->transmitted%NCCL_STEPS;
if (args->transmitted < args->posted && args->transmitted < args->done + NCCL_STEPS) {
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = LOAD(sizesFifo+buffSlot);
char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
int ready = 1;
if (args->protocol == NCCL_PROTO_LL128) {
int ready = resources->useGdr;
if (!ready) {
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
// called threadfence()
uint64_t flag = args->transmitted + 1;
int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
volatile uint64_t* lines = (volatile uint64_t*)buff;
ready = 1;
for (int i=0; i<nFifoLines; i++) {
if (LOAD(lines+i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS) != flag) { ready = 0; break; }
}
}
} else if (args->protocol == NCCL_PROTO_LL) {
uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *f2 = &lines[i].flag2;
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
}
}
if (ready) {
// flush HDP if not done
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
args->hdp_flushed = LOAD(recvTail);
STORE(resources->curr_hdp_reg, 1);
}
// Data is ready, try to send.
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
#ifdef ENABLE_PROFILING
if (args->channel->active_req == 0) {
gettimeofday(&args->channel->tvs, NULL);
args->channel->sizes = 0;
}
args->channel->active_req ++;
args->channel->sizes += LOAD(sizesFifo+buffSlot);
args->channel->send_byte += LOAD(sizesFifo+buffSlot);
#endif
TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->transmitted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
}
}
}
}
// Check whether the network has completed some send operations.
if (args->done < args->transmitted) {
int done;
int buffSlot = args->done%NCCL_STEPS;
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", args->done, buffSlot, args->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
args->channel->active_req --;
if (args->channel->active_req == 0) {
struct timeval tv;
gettimeofday(&tv, NULL);
float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
if (delta) {
args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
args->channel->bw_count ++;
}
}
}
#endif
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
if (sub->done == sub->nsteps) continue;
struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
void* mhandle = *(resources->mhandlesProto[p]);
int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = sub->connector->conn.buffs[p];
int buffSize = stepSize*args->sliceSteps;
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
if (sub->sendbytes < buffSize) buffSize = sub->sendbytes;
// Post buffers to the GPU
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
if (resources->shared) {
char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, ptr));
}
args->done += args->sliceSteps;
if (resources->shared == 0) {
resources->sendMem->head = args->done;
}
char* ptr;
int sharedBuffSlot = sub->posted%NCCL_STEPS;
NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 0, sub->channel->id, sharedBuffSlot, s, &ptr));
resources->recvMem->ptrsFifo[buffSlot] = ptr;
__sync_synchronize();
volatile uint64_t* sendHead = &resources->sendMem->head;
sub->posted += args->sliceSteps;
*sendHead = sub->base + sub->posted - NCCL_STEPS;
} else sub->posted += args->sliceSteps;
args->idle = 0;
if (args->done == args->end) {
resources->step = args->end;
args->state = ncclProxyOpNone;
}
return ncclSuccess;
continue;
}
// Check whether we received data from the GPU and send it to the network
if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) {
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = sizesFifo[buffSlot];
char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
int ready = 1;
if (p == NCCL_PROTO_LL128) {
ready = resources->useGdr;
if (!ready) {
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
// called threadfence()
uint64_t flag = sub->base+sub->transmitted+1;
int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
volatile uint64_t* lines = (volatile uint64_t*)buff;
ready = 1;
for (int i=0; i<nFifoLines; i++) {
if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
}
}
} else if (p == NCCL_PROTO_LL) {
uint32_t flag = NCCL_LL_FLAG(sub->base+sub->transmitted+1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *f2 = &lines[i].flag2;
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
}
}
if (ready) {
// flush HDP if not done
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
args->hdp_flushed = LOAD(recvTail);
STORE(resources->curr_hdp_reg, 1);
}
// Data is ready, try to send.
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
#ifdef ENABLE_PROFILING
if (sub->channel->active_req == 0) {
gettimeofday(&sub->channel->tvs, NULL);
sub->channel->sizes = 0;
}
sub->channel->active_req ++;
sub->channel->sizes += LOAD(sizesFifo+buffSlot);
sub->channel->send_byte += LOAD(sizesFifo+buffSlot);
#endif
TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
sizesFifo[buffSlot] = -1;
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
sub->transmitted += args->sliceSteps;
args->idle = 0;
continue;
}
}
}
}
// Check whether the network has completed some send operations.
if (sub->done < sub->transmitted) {
int done;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
sub->channel->active_req --;
if (sub->channel->active_req == 0) {
struct timeval tv;
gettimeofday(&tv, NULL);
float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec;
if (delta) {
sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3;
sub->channel->bw_count ++;
}
}
}
#endif
sub->done += args->sliceSteps;
if (resources->shared == 0) {
resources->sendMem->head = sub->base + sub->done;
}
args->idle = 0;
if (sub->done == sub->nsteps) {
resources->step = sub->base + sub->nsteps;
args->done++;
}
}
}
}
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
}
}
return ncclSuccess;
}
ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
if (args->state == ncclProxyOpReady) {
// Round to next multiple of sliceSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->posted = args->received = args->transmitted = args->done = resources->step;
args->end = resources->step + args->nsteps;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->transmitted = sub->done = 0;
}
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* mhandle = *(resources->mhandlesProto[p]);
int buffSize = stepSize*args->sliceSteps;
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
if (args->recvbytes < buffSize) buffSize = args->recvbytes;
if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
int buffSlot = args->posted%NCCL_STEPS;
char* ptr;
if (resources->shared) {
NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, &ptr));
if (ptr == NULL) return ncclInternalError;
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
STORE(ptrsFifo+buffSlot, ptr);
} else {
ptr = localBuff+buffSlot*stepSize;
}
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
if (args->channel->active_req == 0) {
gettimeofday(&args->channel->tvs, NULL);
args->channel->sizes = 0;
}
args->channel->active_req ++;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
if (sub->done == sub->nsteps) continue;
struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
void* mhandle = *(resources->mhandlesProto[p]);
int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = sub->connector->conn.buffs[p];
int buffSize = stepSize*args->sliceSteps;
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
if (sub->recvbytes < buffSize) buffSize = sub->recvbytes;
if ((sub->posted < sub->done + NCCL_STEPS) && (sub->posted < sub->nsteps)) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
char* ptr;
if (resources->shared) {
int sharedBuffSlot = sub->posted%NCCL_STEPS;
NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 1, sub->channel->id, sharedBuffSlot, s, &ptr));
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
ptrsFifo[buffSlot] = ptr;
} else {
ptr = localBuff+buffSlot*stepSize;
}
#endif
args->posted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
} else if (resources->shared) {
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
}
}
if (args->posted > args->received) {
int buffSlot = args->received%NCCL_STEPS;
int done, size;
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
if (done) {
args->received += args->sliceSteps;
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
args->channel->active_req --;
args->channel->sizes += size;
args->channel->recv_byte += size;
if (args->channel->active_req == 0) {
struct timeval tv;
gettimeofday(&tv, NULL);
float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
if (delta) {
args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
args->channel->bw_count ++;
if (args->protocol == NCCL_PROTO_SIMPLE) {
if (sub->channel->active_req == 0) {
gettimeofday(&sub->channel->tvs, NULL);
sub->channel->sizes = 0;
}
sub->channel->active_req ++;
}
#endif
sub->posted += args->sliceSteps;
args->idle = 0;
continue;
}
}
if (sub->posted > sub->received) {
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
int done, size;
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, &size));
if (done) {
sub->received += args->sliceSteps;
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
sub->channel->active_req --;
sub->channel->sizes += size;
sub->channel->recv_byte += size;
if (sub->channel->active_req == 0) {
struct timeval tv;
gettimeofday(&tv, NULL);
float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec;
if (delta) {
sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3;
sub->channel->bw_count ++;
}
}
}
}
#endif
if (size > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
// Don't pass data to the GPU yet, flush first.
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, args->requests+buffSlot));
} else {
args->requests[buffSlot] = NULL;
if (size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
// Don't pass data to the GPU yet, flush first.
// GDRCOPY support
if (resources->devFlushMem) {
#if defined (__x86_64__)
// Force a PCI-E read from GPU memory
asm volatile ("mov (%0), %%eax" :: "l"(resources->devFlushMem) : "%eax");
#else
WARN("NET: GDR Flush only supported on x86_64");
return ncclInternalError;
#endif
sub->requests[buffSlot] = NULL;
} else {
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, sub->requests+buffSlot));
}
} else {
sub->requests[buffSlot] = NULL;
}
args->idle = 0;
continue;
}
}
if (sub->received > sub->transmitted) {
// Progress flush operations
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
int done = 1;
if (sub->requests[buffSlot]) NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
if (done) {
sub->transmitted += args->sliceSteps;
__sync_synchronize();
if (resources->devRecvMem) {
// GDRCOPY support: Write updated tail directly to the device memory
resources->devRecvMem->tail = sub->base + sub->transmitted;
wc_store_fence(); // Flush out WC write
} else {
resources->recvMem->tail = sub->base + sub->transmitted;
}
args->idle = 0;
continue;
}
}
if (sub->transmitted > sub->done) {
volatile uint64_t* sendHead = &resources->sendMem->head;
uint64_t done = *sendHead;
while (done > sub->base + sub->done &&
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
sub->transmitted > sub->done) {
sub->done += args->sliceSteps;
args->idle = 0;
if (sub->done == sub->nsteps) {
resources->step = sub->base + sub->nsteps;
args->done++;
}
}
args->idle = 0;
return ncclSuccess;
}
}
if (args->received > args->transmitted) {
// Progress flush operations
int buffSlot = args->transmitted%NCCL_STEPS;
int done = 1;
if (args->requests[buffSlot]) NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
args->transmitted += args->sliceSteps;
__sync_synchronize();
resources->recvMem->tail = args->transmitted;
args->idle = 0;
return ncclSuccess;
}
}
if (args->transmitted > args->done) {
volatile uint64_t* sendHead = &resources->sendMem->head;
uint64_t done = LOAD(sendHead);
while (done > args->done &&
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
args->transmitted > args->done) {
if (resources->shared) {
char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
}
args->done += args->sliceSteps;
args->idle = 0;
if (args->done == args->end) {
resources->step = args->end;
args->state = ncclProxyOpNone;
}
}
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
}
}
return ncclSuccess;
+32 -28
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -649,22 +649,22 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
req->size = size;
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
wr.wr_id = (uint64_t)req;
struct ibv_send_wr wr[2];
memset(&wr[0], 0, sizeof(wr[0]));
wr[0].wr_id = (uint64_t)req;
struct ibv_sge sge;
if (size == 0) {
wr.sg_list = NULL;
wr.num_sge = 0;
wr[0].sg_list = NULL;
wr[0].num_sge = 0;
} else {
sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
wr.sg_list = &sge;
wr.num_sge = 1;
wr[0].sg_list = &sge;
wr[0].num_sge = 1;
}
#if USE_RDMA_WRITE == 0
wr.opcode = IBV_WR_SEND;
wr.send_flags = IBV_SEND_SIGNALED;
wr[0].opcode = IBV_WR_SEND;
wr[0].send_flags = IBV_SEND_SIGNALED;
#else
__sync_synchronize(); // order the readyPtr load against rkey load below
// Sanity checks to catch user collective call count/size mismatches
@@ -674,15 +674,11 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
return ncclInternalError;
}
int useAr = 0;
if (size > ncclParamIbArThreshold()) {
useAr = 1;
}
wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
wr.send_flags = useAr ? 0 : IBV_SEND_SIGNALED;
wr.wr.rdma.remote_addr = LOAD(&slot->addr);
wr.wr.rdma.rkey = LOAD(&slot->rkey);
wr.imm_data = size; // Send the message size via imm_data
wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
wr[0].send_flags = IBV_SEND_SIGNALED;
wr[0].wr.rdma.remote_addr = slot->addr;
wr[0].wr.rdma.rkey = slot->rkey;
wr[0].imm_data = size; // Send the message size via imm_data
__sync_synchronize();
#endif
// We must clear slot->ready, but reset other fields to aid
@@ -692,21 +688,29 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
STORE(&slot->rkey, 0); STORE(&slot->size, 0); STORE(&slot->seq, 0);
comm->fifoHead++;
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
#if USE_RDMA_WRITE
// When using adaptive routing, send the bulk of the data first as an
// RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
// completion.
if (useAr) {
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
wr.sg_list = NULL;
wr.num_sge = 0;
wr.send_flags |= IBV_SEND_SIGNALED;
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
if (size > ncclParamIbArThreshold()) {
memset(&wr[1], 0, sizeof(wr[1]));
memcpy(&wr[1], &wr[0], sizeof(wr[0]));
wr[1].sg_list = NULL;
wr[1].num_sge = 0;
wr[0].next = &wr[1];
wr[0].opcode = IBV_WR_RDMA_WRITE;
wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
wr[0].send_flags = 0;
wr[1].send_flags = IBV_SEND_SIGNALED;
}
#endif
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qp, wr, &bad_wr));
*request = req;
return ncclSuccess;
}
+2 -4
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,9 +11,7 @@
#include "net.h"
#include "param.h"
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <poll.h>
#include <limits.h>
+20 -18
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -165,17 +165,12 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct p2pSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
int sendSize = sizeof(struct ncclSendMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
resources->next_hdp_reg = 0;
uint32_t linktype, hops;
@@ -189,16 +184,22 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
}
struct p2pConnectInfo info;
info.read = useRead;
// For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
info.read = (connIndex == 0) ? useRead : 0;
const char* useReadStr = info.read ? "/read" : "";
int sendSize = sizeof(struct ncclSendMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
if (info.read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
resources->remoteId = -1;
resources->bootstrap = comm->bootstrap;
if (intermediateRank == -1) {
NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize, true));
info.rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
if (useRead == 0) send->conn.direct |= NCCL_DIRECT_GPU;
if (info.read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else {
@@ -224,20 +225,21 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
/* Create and return connect structures for this peer to connect to me */
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
struct p2pRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
int recvSize = offsetof(struct ncclRecvMem, buff);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
struct p2pConnectInfo info;
info.read = useRead;
// For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
info.read = (connIndex == 0) ? useRead : 0;
int recvSize = offsetof(struct ncclRecvMem, buff);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info.read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
resources->remoteId = -1;
resources->bootstrap = comm->bootstrap;
@@ -245,7 +247,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize, true));
info.rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
if (useRead == 0) recv->conn.direct |= NCCL_DIRECT_GPU;
if (info.read == 0) recv->conn.direct |= NCCL_DIRECT_GPU;
} else {
CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
}
+3 -4
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -57,8 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
#define MAX_SHM_NAME_LEN 1024
/* Create and return connect structures for this peer to connect to me */
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct shmSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
@@ -81,7 +80,7 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
return ncclSuccess;
}
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct shmRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
+1 -1
Просмотреть файл
@@ -40,7 +40,7 @@ namespace CorrectnessTests
hipStream_t stream;
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
struct ncclChannel* channel = comm->channels;
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send.conn.head));
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
uint64_t *real_head, *fake_head, *fake_h;
// get original head
+1 -1
Просмотреть файл
@@ -41,7 +41,7 @@ namespace CorrectnessTests
hipStream_t stream;
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
struct ncclChannel* channel = comm->channels;
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send.conn.head));
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
uint64_t *real_head, *fake_head, *fake_h;
// get original head
+2 -4
Просмотреть файл
@@ -17,6 +17,7 @@ struct allGather1Data_t {
// AllGather3 - begin
struct ncclGraphInfo {
int pattern;
int nChannels;
int sameChannels;
float speedIntra;
float speedInter;
@@ -25,11 +26,8 @@ struct ncclGraphInfo {
};
struct allGather3Data_t{
int cudaCompCap;
int fullCudaCompCap;
int nChannels;
int collNetSupport;
int nc;
int alltoallDisable;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
struct ncclGraphInfo collNet;
+18 -12
Просмотреть файл
@@ -84,7 +84,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
if (myInfo->pidHash == peerInfo->pidHash) {
if (myInfo->cudaDev == peerInfo->cudaDev) {
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
@@ -103,7 +103,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
/* Create and return connect structures for this peer to connect to me */
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
return ncclSuccess;
}
@@ -126,12 +126,12 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
}
/* Create and return connect structures for this peer to connect to me */
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
return ncclSuccess;
}
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
return ncclSuccess;
}
@@ -148,10 +148,12 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
return ncclSuccess;
}
ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
int netDev, useGdr = 0;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev));
// Send/Receive: Round-robin NICs based on the receiver's CUDA device
int nicRR = comm->peerInfo[peerInfo->rank].cudaDev;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr));
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev,
@@ -159,10 +161,14 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
return ncclSuccess;
}
ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
int netDev, useGdr = 0;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev));
// Send/Receive: Round-robin NICs based on the receiver's CUDA device
int nicRR = comm->cudaDev;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr));
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev,
@@ -183,20 +189,20 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
return ncclSuccess;
}
ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
int netDev, useGdr = 0;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr));
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
int netDev, useGdr = 0;
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr));
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? "/GDRDMA" : "");
+3
Просмотреть файл
@@ -191,6 +191,9 @@ int main(int argc,char* argv[])
assert(node_model!=0);
comm[i].topo = node_model->getSystem(i);
bootstrapAllGather(&comm[i], allGather1Data);
// Mark channels as non initialized.
for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
NCCLCHECK(ncclCalloc((uint32_t**)&comm[i].p2pNet, 1));
}
struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph;
+204 -123
Просмотреть файл
@@ -39,9 +39,10 @@ extern NodeModel *node_model;
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
thread_local int ncclDebugNoWarn = 0;
ncclCollNet_t* ncclCollNet = 0;
ncclCollNet_t* ncclCollNet = NULL;
// Get current Compute Capability
int ncclCudaCompCap() {
@@ -80,7 +81,7 @@ void ncclDebugInit() {
if (ncclDebugLevel != -1) return;
const char* nccl_debug = getenv("NCCL_DEBUG");
if (nccl_debug == NULL) {
ncclDebugLevel = NCCL_LOG_NONE;
ncclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
ncclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
@@ -97,6 +98,8 @@ void ncclDebugInit() {
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return;
if (ncclDebugLevel < level || ((flags & (NCCL_INIT|NCCL_GRAPH)) == 0)) return;
char buffer[1024];
size_t len = 0;
if (node_model) len = snprintf(buffer, sizeof(buffer),
@@ -140,10 +143,12 @@ ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t *
extern struct ncclTransport collNetTransport;
// All ranks must participate in collNetSetup call
// type: 0 for send, 1 for recv
// return: 0 - unsupported, 1 - supported
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type) {
int rank = comm->rank;
int nranks = comm->nRanks;
int nMasters = comm->nNodes;
int rankInCollNet = -1;
int supported = 0;
int isMaster = (rank == masterRank) ? 1 : 0;
@@ -161,21 +166,22 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
}
// send master receives connect info from peer recv master
if (isMaster && type == 0) {
//NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
if (isMaster && type == collNetSend) {
//NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)));
rankInCollNet = sendrecvExchange.collNetRank;
INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
// select
struct ncclPeer* root = channel->peers+nranks;
struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send;
struct ncclTransportComm* transportComm = (type == 1) ? &(collNetTransport.recv) : &(collNetTransport.send);
// connector index: 0 for recv, 1 for send
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
conn->transportComm = transportComm;
// setup
struct ncclConnect myConnect;
if (isMaster && ret > 0) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
ncclResult_t res;
@@ -184,63 +190,89 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
ncclConnect connect;
} *allConnects = NULL;
ncclConnect *masterConnects = NULL;
//NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
if (type == 1) { // recv side: AllGather
NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
if (type == collNetRecv) { // recv side: AllGather
// all ranks must participate
//NCCLCHECK(ncclCalloc(&allConnects, nranks));
//allConnects[rank].isMaster = isMaster;
//memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
NCCLCHECK(ncclCalloc(&allConnects, nranks));
allConnects[rank].isMaster = isMaster;
memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
// consolidate
//int c = 0;
//for (int r = 0; r < nranks; r++) {
//if (allConnects[r].isMaster) {
//memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
//if (r == rank) rankInCollNet = c;
//c++;
//}
//}
int c = 0;
for (int r = 0; r < nranks; r++) {
if (allConnects[r].isMaster) {
memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
if (r == rank) rankInCollNet = c;
c++;
}
}
} else { // send side : copy in connect info received from peer recv master
//if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
}
// connect
if (isMaster && ret > 0) {
//NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
//struct ncclPeer* devRoot = channel->devPeers+nranks;
//struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
struct ncclPeer* devRoot = channel->devPeers+nranks;
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
//CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
if (isMaster && type == 1) {
//sendrecvExchange.collNetRank = rankInCollNet;
if (isMaster && type == collNetRecv) {
sendrecvExchange.collNetRank = rankInCollNet;
//memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
//NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
//NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
if (ret > 0) {
supported = 1;
}
cleanup:
//if (allConnects != NULL) free(allConnects);
//if (masterConnects != NULL) free(masterConnects);
if (allConnects != NULL) free(allConnects);
if (masterConnects != NULL) free(masterConnects);
return supported;
}
// Stub: unconditionally flags CollNet as supported on `comm`.
// `rank` and `collNetSetupFail` are accepted but not consulted.
// Always returns ncclSuccess.
static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) {
  (void)rank;              // intentionally unused
  (void)collNetSetupFail;  // intentionally unused
  comm->collNetSupport = 1;
  return ncclSuccess;
}
// Installs a placeholder CollNet handle when the COLLNET_ENABLE parameter is 1
// and no handle has been set yet. Otherwise leaves ncclCollNet untouched.
void initCollNet() {
  // The parameter is read first, before the handle is inspected.
  if (ncclParamCollNetEnable() != 1) return;
  if (ncclCollNet != 0) return;
  // NOTE(review): 0x12345678 is an arbitrary non-null constant; presumably it
  // is only ever compared against NULL and never dereferenced -- confirm.
  ncclCollNet = (ncclCollNet_t*)0x12345678;
}
// Decides whether CollNet stays enabled after setup.
//
//   comm              communicator being initialized
//   collNetSetupFail  non-zero if this rank's CollNet setup failed
//
// A per-rank failure table is allocated and this rank's slot is filled in.
// The bootstrap all-gather that would populate the other slots is commented
// out below, so only the local slot can be non-zero here. If any slot is
// non-zero, rank 0 emits a warning, the CollNet root connectors of every
// channel have their transportResources cleared, and comm->collNetSupport is
// set to 0.
//
// Returns ncclSuccess, or the error propagated by NCCLCHECK if the table
// allocation fails.
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
  const int myRank = comm->rank;
  const int rankCount = comm->nRanks;

  // Failure table: one int per rank.
  int* failureTable;
  NCCLCHECK(ncclCalloc(&failureTable, rankCount));
  failureTable[myRank] = collNetSetupFail;
  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, failureTable, sizeof(int)));

  // Scan for the first rank that reported a failure.
  int firstFailed = 0;
  while (firstFailed < rankCount && failureTable[firstFailed] == 0) firstFailed++;
  if (firstFailed < rankCount) collNetSetupFail = 1;
  free(failureTable);

  if (!collNetSetupFail) return ncclSuccess;

  if (myRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");

  // Drop the CollNet resources of each channel. The extra peer at index
  // nRanks is the CollNet root. The transportComm->free() calls are commented
  // out here, so the pointers are only cleared (which also guards against a
  // later double free).
  for (int c = 0; c < comm->nChannels; c++) {
    struct ncclPeer* rootPeer = comm->channels[c].peers + rankCount;
    //if (rootPeer->send->transportResources && rootPeer->send->transportComm) NCCLCHECK(rootPeer->send->transportComm->free(rootPeer->send->transportResources));
    //if (rootPeer->recv->transportResources && rootPeer->recv->transportComm) NCCLCHECK(rootPeer->recv->transportComm->free(rootPeer->recv->transportResources));
    rootPeer->send[0].transportResources = NULL;
    rootPeer->recv[0].transportResources = NULL;
  }

  // Fall back: CollNet is no longer advertised as supported.
  comm->collNetSupport = 0;
  return ncclSuccess;
}
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
int rank = comm->rank;
int nranks = comm->nRanks;
//uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
//TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
//NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
@@ -301,12 +333,11 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
}
struct ncclComm* intraRank0Comm = allGather1Data[intraRank0].comm;
//free(allGather1Data);
// AllGather1 - end
// Topo detection / System graph creation
//NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
// save nRanks to ncclTopoSystem as indicator of multi-node
comm->topo->nRanks = comm->nRanks;
// Compute paths between GPUs and NICs
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
@@ -345,7 +376,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
collNetGraph.collNet = 1;
collNetGraph.crossNic = ncclParamCrossNic();
collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
collNetGraph.minChannels = 1;
collNetGraph.maxChannels = ringGraph.nChannels;
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
@@ -354,10 +386,26 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
}
// Determine CollNet support
if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
if (intraRanks > 8) {
if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
comm->collNetSupport = 0;
}
if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
if (rcclParamP2pNetDisable() == 0) {
STORE(comm->p2pNet, 1);
INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
}
else
INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
}
// AllGather3 - begin
#if 0
struct ncclGraphInfo {
int pattern;
int nChannels;
int sameChannels;
float speedIntra;
float speedInter;
@@ -366,9 +414,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
};
struct {
int cudaCompCap;
int fullCudaCompCap;
int nChannels;
int collNetSupport;
int nc;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
@@ -380,52 +426,44 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
#endif
int idx;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
std::min(treeGraph.nChannels, ringGraph.nChannels);
allGather3Data[rank].nc = comm->nChannels*2;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4);
allGather3Data[rank].nc = 2;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
allGather3Data[rank].nc = comm->nChannels*4;
if (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count && (comm->topo->type & RCCL_TOPO_4P2H_ROME))
allGather3Data[rank].nc = (comm->topo->nodes[NET].count > 3 ? 2 : 4)*comm->topo->nodes[NET].count;
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
allGather3Data[rank].nc = 6;
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
allGather3Data[rank].ring.pattern = ringGraph.pattern;
allGather3Data[rank].ring.nChannels = ringGraph.nChannels;
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels;
allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
allGather3Data[rank].collNetSupport = comm->collNetSupport;
// CollNet channels are already duplicated
comm->collNetnChannels = 2*collNetGraph.nChannels;
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
return ncclSuccess;
}
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
if (channel->id != -1) return ncclSuccess;
channel->id = channelid;
// Setup intermediate buffering
//int buffSize = ncclParamBuffsize();
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
//channel->buffSize = buffSize != -2 ? buffSize :
// cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
// Ring index to user rank table.
//NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
@@ -434,12 +472,25 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
//NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
for (size_t i=0; i<comm->nRanks+1; ++i) {
channel->peers[i].send.comm = comm;
channel->peers[i].recv.comm = comm;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
channel->peers[i].send[b].comm = comm;
channel->peers[i].recv[b].comm = comm;
}
}
// Per-channel operation list.
//NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
//NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
//if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
// GDRCOPY support
// We allocate a workFifo in GDR mapped CUDA memory
// But we still allocate the Host workFifo so that we
// can copy the work elements to CUDA memory on kernel launch
//NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc));
//} else {
// The device workFifo is the Host one
//channel->workFifoDev = channel->workFifo;
//}
return ncclSuccess;
}
@@ -461,16 +512,34 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
return ncclSuccess;
}
// Reports through *ret whether two peers are linked by a PATH_NVL path.
//
//   ret     out: 1 if both peers have the same hostHash and the GPU-to-GPU
//           path between them has type PATH_NVL, else 0
//   system  topology used to look up the GPU indices and paths
//   info1   first peer
//   info2   second peer
//
// Returns ncclSuccess, or the error propagated by NCCLCHECK if a rank cannot
// be mapped to a GPU index (in which case *ret is left at 0).
static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  *ret = 0;
  if (info1->hostHash == info2->hostHash) {
    int gpuA, gpuB;
    NCCLCHECK(ncclTopoRankToIndex(system, info1->rank, &gpuA));
    NCCLCHECK(ncclTopoRankToIndex(system, info2->rank, &gpuB));
    const bool viaNvl = (system->nodes[GPU].nodes[gpuA].paths[GPU][gpuB].type == PATH_NVL);
    *ret = viaNvl ? 1 : 0;
  }
  return ncclSuccess;
}
template <int type>
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) {
struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
comm->channels[channelId].peers[peer].recv + connIndex;
int xgmi;
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, myInfo, peerInfo));
for (int t=0; t<NTRANSPORTS; t++) {
if (connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P)))
continue;
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
int ret = 0;
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
if (ret) {
connector->transportComm = transportComm;
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId));
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
return ncclSuccess;
}
}
@@ -478,25 +547,30 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
return ncclInternalError;
}
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
uint32_t mask = 1 << channel->id;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
comm->connectRecv[peer] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue;
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
comm->connectSend[peer] |= mask;
}
return ncclSuccess;
}
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
//hipStream_t transportSetupStream;
//CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
struct ncclConnect data[2*MAXCHANNELS];
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer];
@@ -506,50 +580,50 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
int sendChannels = 0, recvChannels = 0;
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex));
}
}
struct ncclConnect* sendData = recvData+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex));
}
}
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
//NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
//NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
//NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
//NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
sendData = data;
recvData = data+sendChannels;
}
} else {
//if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
//if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
//if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
//if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
//if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
//if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
//if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
//if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
}
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
//NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
conn->connected = 1;
//CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
}
}
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
//NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
conn->connected = 1;
//CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
}
}
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
}
//CUDACHECK(hipStreamSynchronize(transportSetupStream));
//CUDACHECK(hipStreamDestroy(transportSetupStream));
return ncclSuccess;
}
@@ -587,24 +661,28 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
allTopoRanks[i] = &allGather3Data[i].topoRanks;
nc = std::min(allGather3Data[i].nc, nc);
// Make sure we align all ranks so that the tuning is consistent across ranks
treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
}
comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
if (comm->nChannels < nChannelsOrig) {
// We started duplicating channels during Preset(), so we need to move the
// duplicated channels since we have removed some.
@@ -613,15 +691,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, nc));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {
NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
} else {
comm->collNetnChannels = 0;
}
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc));
free(allTopoRanks);
free(nodesTreePatterns);
@@ -657,46 +727,58 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all rings");
// Connect Trees
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all trees");
// Check if we can setup CollNet
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {
for (int c=comm->nChannels; c<comm->collNetnChannels; c++)
NCCLCHECK(initChannel(comm, c));;
int logicChannels = comm->collNetnChannels/2;
if (comm->collNetSupport > 0) {
int collNetSetupFail = 0;
const int recvIndex = 0; // recv GPU index is always 0
const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
for (int c=0; c<logicChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
struct ncclChannel* channelSend = comm->channels+c;
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down));
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up));
const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
collNetSetupFail = 1;
else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
collNetSetupFail = 1;
// Find all head ranks
int nHeads = collNetGraph.nChannels;
int *heads;
NCCLCHECK(ncclCalloc(&heads, nHeads));
// Head GPU index is always 0
for (int c=0; c<nHeads; c++) {
heads[c] = collNetGraph.intra[c*comm->localRanks+0];
}
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
for (int h=0; h<nHeads; h++) {
const int head = heads[h];
if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv) != 1)
collNetSetupFail = 1;
else if (ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend) != 1)
collNetSetupFail = 1;
}
}
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph));
// Verify CollNet setup across ranks
NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
NCCLCHECK(ncclTransportCollNetCheck(comm, collNetSetupFail));
if (comm->collNetSupport) {
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+c;
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0));
}
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0));
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelSend = comm->channels+c;
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1));
}
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1));
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
}
}
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
free(rings);
@@ -714,10 +796,9 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
affinity_restore:
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
//sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
if (ret != ncclSuccess) return ret;
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}