Add LL128 Protocol.

Rewrite the topology detection and tree/ring creation (#179). Improve
tree performance by sending/receiving from different GPUs. Add
model-based tuning to switch between the different algorithms and
protocols.

Rework P2P/SHM detection in containers (#155, #248).

Detect duplicated devices and return an error (#231).

Add tuning for GCP.

[ROCm/rccl commit: 299c554dcc]
This commit is contained in:
Sylvain Jeaugey
2019-11-19 14:57:39 -08:00
committed by GitHub
szülő 221b65bee1
commit 71560fd67b
65 fájl változott, egészen pontosan 4783 új sor hozzáadva és 2832 régi sor törölve
+5 -3
Fájl megtekintése
@@ -25,8 +25,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
# Better define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_35,code=sm_35 \
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
@@ -46,7 +45,10 @@ endif
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
CXXFLAGS += -I $(CUDA_INC)
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Maxrregcount needs to be set according to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
+2 -2
Fájl megtekintése
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 4
NCCL_PATCH := 8
NCCL_MINOR := 5
NCCL_PATCH := 6
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
Fájl megtekintése
@@ -17,7 +17,7 @@ DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
PKG_TIMESTAMP := $(shell date -R)
ARCH := $(shell uname -m)
PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g"| sed -e "s/aarch64/arm64/g")
PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
ifeq ($(PKG_MULTIARCH),)
# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
+8 -7
Fájl megtekintése
@@ -9,10 +9,11 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
##### lib files
LIBNAME := libnccl.so
@@ -94,17 +95,17 @@ $(PKGDIR)/nccl.pc : nccl.pc.in
$(INCDIR)/%.h : %.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
install -m 644 $< $@
$(INCDIR)/nccl_%.h : include/nccl_%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
install -m 644 $< $@
$(PKGDIR)/%.pc : %.pc
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(PKGDIR)
cp -f $< $@
install -m 644 $< $@
$(OBJDIR)/%.o : %.cc
@printf "Compiling %-35s > %s\n" $< $@
@@ -117,8 +118,8 @@ $(OBJDIR)/%.o : %.cc
@rm -f $(@:%.o=%.d.tmp)
clean :
rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
$(MAKE) -C collectives/device clean
rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
install : lib
mkdir -p $(PREFIX)/lib
+50 -58
Fájl megtekintése
@@ -13,11 +13,6 @@
#include <unistd.h>
#include <sys/types.h>
// Always use sockets for bootstrap
struct bootstrapNetHandle {
union socketAddress connectAddr;
};
// Communicator object used by the bootstrap network functions in this file.
struct bootstrapNetComm {
// File descriptor; passed by address to createListenSocket() and
// connectAddress(), which are expected to fill it in.
int fd;
};
@@ -68,36 +63,36 @@ static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr
/* Socket Interface Selection type */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) {
struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large");
static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
// if dev >= 0, listen based on dev
if (dev >= 0) {
NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr)));
NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
} else if (dev == findSubnetIf) {
// handle stores a remote address
// need to find a local addr that is in the same network as the remote addr
union socketAddress localAddr;
char ifName[MAX_IF_NAME_SIZE];
if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
// pass the local address back
memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
memcpy(connectAddr, &localAddr, sizeof(localAddr));
} // Otherwise, handle stores a local address
struct bootstrapNetComm* comm;
NCCLCHECK(bootstrapNetNewComm(&comm));
NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
*listenComm = comm;
return ncclSuccess;
}
static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) {
static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
struct bootstrapNetComm* comm;
NCCLCHECK(bootstrapNetNewComm(&comm));
struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
NCCLCHECK(connectAddress(&comm->fd, connectAddr));
*sendComm = comm;
return ncclSuccess;
}
@@ -145,21 +140,12 @@ static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
return ncclSuccess;
}
ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) {
struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str));
ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
return ncclSuccess;
}
struct extId {
ncclNetHandle_t extHandleRoot;
void* extListenComm;
uint64_t hostHash;
pid_t pid;
int fd;
pthread_t boostrapThread;
};
struct extInfo {
int rank;
int nranks;
@@ -177,9 +163,8 @@ static ncclResult_t setFilesLimit() {
return ncclSuccess;
}
static void *bootstrapRoot(void* commId) {
static void *bootstrapRoot(void* listenComm) {
struct extInfo info;
struct extId* id = (struct extId*)commId;
ncclNetHandle_t *rankHandles = NULL;
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
ncclNetHandle_t zero = { 0 }; // for sanity checking
@@ -191,7 +176,7 @@ static void *bootstrapRoot(void* commId) {
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
@@ -216,22 +201,22 @@ static void *bootstrapRoot(void* commId) {
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
++c;
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
} while (c < nranks);
TRACE(NCCL_INIT, "COLLECTED HANDLES");
TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
void *tmpSendComm;
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
}
TRACE(NCCL_INIT, "SENT OUT HANDLES");
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
out:
bootstrapNetCloseListen(id->extListenComm);
free(commId);
bootstrapNetCloseListen(listenComm);
if (rankHandles) free(rankHandles);
if (rankHandlesRoot) free(rankHandlesRoot);
@@ -239,31 +224,28 @@ out:
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
struct extId* id = (struct extId*)commId;
id->hostHash = getHostHash();
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
ncclUniqueId* threadIdCopy;
NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
void* listenComm;
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
pthread_t thread;
pthread_create(&thread, NULL, bootstrapRoot, listenComm);
return ncclSuccess;
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
extId* id = (extId*)out;
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
memset(id, 0, sizeof(ncclUniqueId));
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
char* env = getenv("NCCL_COMM_ID");
if (env) {
if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) {
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
id->pid = -1;
} else {
id->pid = getpid();
NCCLCHECK(bootstrapCreateRoot(out, false));
NCCLCHECK(bootstrapCreateRoot(id, false));
}
return ncclSuccess;
@@ -286,9 +268,9 @@ struct extState {
int dev;
};
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
struct extId* id = (struct extId*)commId;
bool idFromEnv = id->pid < 0;
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
@@ -303,8 +285,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
void *tmpSendComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
@@ -323,7 +305,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
}
// send info on my listening socket to root
NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
@@ -334,7 +316,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
@@ -377,7 +359,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
void* tmpSendComm;
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
@@ -465,3 +447,13 @@ ncclResult_t bootstrapClose(void* commState) {
return ncclSuccess;
}
/*
 * Release every bootstrap resource held by commState after an abort.
 *
 * Closes the listen communicator and both ring communicators, then frees the
 * peer handle table and the state object itself.
 *
 * commState : opaque pointer to the struct extState for this communicator.
 *             May be NULL when the abort happens before any bootstrap state
 *             was created, in which case there is nothing to release.
 *
 * Always returns ncclSuccess: teardown is best effort, so a failure to close
 * one communicator must not prevent the remaining resources from being freed.
 */
ncclResult_t bootstrapAbort(void* commState) {
  struct extState* state = (struct extState*)commState;

  // Nothing was set up, so nothing to tear down (avoids a NULL dereference).
  if (state == NULL) return ncclSuccess;

  // Only close communicators that were actually opened; a partially
  // initialised state leaves the others NULL.
  if (state->extBstrapListenComm != NULL) {
    bootstrapNetCloseListen(state->extBstrapListenComm);
  }
  if (state->extBstrapRingSendComm != NULL) {
    bootstrapNetCloseSend(state->extBstrapRingSendComm);
  }
  if (state->extBstrapRingRecvComm != NULL) {
    bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
  }

  // free(NULL) is a no-op, so the handle table needs no guard.
  free(state->peerBstrapHandles);
  free(state);
  return ncclSuccess;
}
@@ -5,7 +5,6 @@
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
@@ -68,4 +68,4 @@ $(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
clean:
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
@@ -11,7 +11,7 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -19,15 +19,15 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -129,3 +129,67 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
// Intentionally empty device function: the body performs no work.
// NOTE(review): presumably a placeholder so that the Tree/LL variant exists
// for AllGather even though only the ring variants do real work — confirm.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
#include "prims_ll128.h"
// AllGather over the channel's ring using ncclLL128Primitives.
//
// For every chunk handled by this block the sequence is:
//   1. send the local input chunk to the next peer (also copying it into the
//      output buffer unless the operation is in place),
//   2. nranks-2 times: receive a chunk, store it in the output and forward it,
//   3. receive the last chunk and store it without forwarding.
//
// args supplies the element count (N), input/output pointers, the block id
// within the operation (bid), the thread count and the number of channels.
// The first template parameter is not used by this function.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
// The channel is selected by the CUDA block index.
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
// One receive peer (ring->prev) and one send peer (ring->next).
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
// Upper bound on the chunk size, expressed in elements of T.
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
// Granularity (in elements of T) to which the per-iteration chunk size is rounded up.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
// Elements covered by all channels in one iteration of the outer loop
// (computed once from the initial chunkSize).
const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Shrink the chunk when little data remains: split what is left across the
// channels, round up to a multiple of minChunkSize, and never grow past the
// current chunkSize.
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
// Start of this block's chunk inside one rank's data.
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
// Number of elements actually processed (the last chunk may be short).
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
// The output holds one region of `size` elements per rank, indexed by rankDest.
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
// Intentionally empty device function: the body performs no work.
// NOTE(review): presumably a placeholder so that the Tree/LL128 variant exists
// for AllGather even though only the ring variants do real work — confirm.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
@@ -11,7 +11,7 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -27,7 +27,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
@@ -85,23 +85,28 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = args->lastChunkSize;
int chunkSize = args->lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nChannels*chunkSize;
if (loopSize > size) {
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -117,8 +122,9 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
} while(0);
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
@@ -149,6 +155,8 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
// Compute pointers
@@ -156,10 +164,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
@@ -168,7 +173,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
@@ -176,7 +181,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -185,7 +190,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -193,7 +198,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -201,7 +206,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
@@ -216,16 +221,21 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nChannels*chunkSize;
if (loopSize > size) {
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -243,6 +253,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
} while(0);
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -259,3 +270,141 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
}
} while(0);
}
#include "prims_ll128.h"
// AllReduce over the channel's ring using ncclLL128Primitives.
//
// For each iteration of the outer loop the data is viewed as nranks slices of
// nChannels chunks each; this block works on chunk `bid` of every slice:
//   1. send one slice chunk to the next peer,
//   2. nranks-2 times: receive, reduce with the local input, forward,
//   3. receive, reduce, store the final value in the output and forward it,
//   4. nranks-2 times: receive a final value, store it and forward it,
//   5. receive the last final value and store it.
//
// The first template parameter is not used by this function.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
// The channel is selected by the CUDA block index.
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
// One receive peer (ring->prev) and one send peer (ring->next).
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
// Upper bound on the chunk size, expressed in elements of T.
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
// Granularity (in elements of T) to which the per-iteration chunk size is rounded up.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
// Elements covered per outer iteration: one chunk per (channel, rank) pair,
// computed once from the initial chunkSize.
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Shrink the chunk when little data remains: split what is left across all
// (channel, rank) pairs, round up to a multiple of minChunkSize, and never
// grow past the current chunkSize.
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
// A slice spans nChannels chunks; this block handles chunk `bid` within it.
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
LLprims.recv(thisOutput+offset, nelem);
}
}
// AllReduce over the channel's up/down trees using ncclLL128Primitives.
//
// Three roles are distinguished:
//   - treeUp->up == -1: a single primitive object receives from the up-tree
//     children, reduces with the local input, writes the output and sends to
//     the down-tree children.
//   - otherwise the threads are split in two groups at nthreadsSplit:
//       * tid <  nthreadsSplit : reduce towards treeUp->up,
//       * tid >= nthreadsSplit : receive from treeDn->up and forward to the
//                                down-tree children.
//
// The first template parameter is not used by this function.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
// The channel is selected by the CUDA block index.
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* treeUp = &channel->treeUp;
struct ncclTree* treeDn = &channel->treeDn;
const ssize_t size = args->N;
// Initial chunk size is provided by the caller through args->lastChunkSize.
ssize_t chunkSize = args->lastChunkSize;
// Granularity (in elements of T) used when the chunk size has to be reduced.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
const ssize_t loopSize = args->nChannels*chunkSize;
// Number of threads assigned to the reduce (up) half when both halves run.
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
// If a single pass would cover more than the whole buffer, shrink the chunk
// to an even share per channel, rounded up to a multiple of minChunkSize.
// loopSize keeps its original value, so the loops below still run only once.
if (loopSize > size) {
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
// No parent in the up tree (presumably this rank is the tree root).
if (treeUp->up == -1) {
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
}
} else {
if (tid < nthreadsSplit) {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
// No first child in the up tree: only the local input is sent.
if (treeUp->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
} else {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
// Thread ids are rebased so this group is numbered from 0.
ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
// No first child in the down tree: receive only, nothing to forward.
if (treeDn->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}
}
@@ -11,7 +11,7 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -29,7 +29,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -100,3 +100,51 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
// Broadcast / tree algorithm / LL protocol: the body is intentionally empty,
// so this device function performs no work when it is called.
// NOTE(review): presumably defined only so that a Tree+LL symbol exists for
// every collective when the function-table macros are expanded — confirm.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
#include "prims_ll128.h"
// Ring broadcast using the LL128 primitives.
//
// The buffer of args->N elements is walked in strides of
// nChannels*chunkSize; this block (args->bid) handles one chunk per stride.
// The role of the calling rank is derived from the ring's devUserRanks:
//   - devUserRanks[0] == root : send the input (copying it to the output
//                               first when the operation is out of place),
//   - devUserRanks[1] == root : receive only (the data goes no further),
//   - otherwise               : receive, store locally and forward.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
  const int nthreads = args->nThreads;
  const int bid = args->bid;
  const int tid = threadIdx.x;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
  struct ncclRing* ring = &channel->ring;

  // One receive peer (ring->prev) and one send peer (ring->next).
  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

  const ssize_t size = args->N;
  const int rank = ring->devUserRanks[0];
  const int nextRank = ring->devUserRanks[1];
  const int root = args->root;
  const bool isRoot = (rank == root);
  const bool feedsRoot = (nextRank == root);

  // Largest chunk, and the granularity used to shrink it near the end of the buffer.
  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
  const ssize_t loopSize = args->nChannels*chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
  const bool inPlace = (thisInput == thisOutput);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    // Shrink the chunk (in multiples of minChunkSize) when little data remains.
    const ssize_t neededChunk = DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize;
    chunkSize = min(neededChunk, chunkSize);

    const ssize_t offset = gridOffset + bid*chunkSize;
    const int nelem = min(chunkSize, size-offset);

    if (!isRoot) {
      if (feedsRoot) {
        LLprims.recv(thisOutput + offset, nelem);
      } else {
        LLprims.recvCopySend(thisOutput + offset, nelem);
      }
    } else if (inPlace) {
      LLprims.send(thisInput+offset, nelem);
    } else {
      LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
    }
  }
}
// Broadcast / tree algorithm / LL128 protocol: the body is intentionally
// empty, so this device function performs no work when it is called.
// NOTE(review): presumably defined only so that a Tree+LL128 symbol exists for
// every collective when the function-table macros are expanded — confirm.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
@@ -7,9 +7,8 @@
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
#include "../collectives.h"
#include "collectives.h"
#include "devcomm.h"
#include "nccl.h"
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
// Each thread sets a predicate to true if abort == 1
@@ -31,17 +30,19 @@ extern __device__ ncclKern_t ncclFuncs[];
// Cooperative copy from src to dst, performed int by int: thread `tid`
// copies elements tid, tid+blockDim.x, tid+2*blockDim.x, ...
// Only size/sizeof(int) whole ints are copied; if `size` is not a multiple of
// sizeof(int), the trailing bytes are not copied.
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
// When aggregation is effective, if some threads have aborted inside the LL kernel,
// make sure the rest of the threads abort as well
exitIfAbortBarrier(0);
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
// Block-wide barrier after the copy loop.
__syncthreads();
}
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) {
// Check whether the last operation was aborted and make sure all threads exit
int abort = tid == 0 ? *(comm->abortFlag) : 0;
exitIfAbortBarrier(abort);
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
__syncthreads();
if (tid == 0) hostColl->active = 0;
}
extern __device__ volatile uint64_t* ncclShmem;
/* Functions for aggregation case */
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
@@ -51,10 +52,11 @@ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
#if NCCL_OP == 0
/* Kernels with the first operation inlined */
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
__shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
ncclShmem = shmem; \
__shared__ struct ncclColl localColl; \
\
struct ncclDevComm* comm = firstColl.args.comm; \
@@ -65,7 +67,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
c = &firstColl; \
} else { \
c = &localColl; \
load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \
} \
while (1) { \
if (tid < c->args.nThreads) { \
@@ -84,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
load_coll(c, channel->devCollectives+nextIndex, tid); \
load_coll(c, channel->devCollectives+nextIndex, tid, comm); \
} \
}
#else
@@ -93,13 +95,14 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
// Only generate inline kernels for LL
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
#if NCCL_TYPE == 0
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
@@ -263,8 +263,6 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread
}
}
#define WARP_SIZE 32
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
@@ -8,13 +8,16 @@
#include "collectives.h"
#include "common.h"
__device__ volatile uint64_t* ncclShmem;
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype)
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll##LL128, op, dtype), \
NCCL_COLL_NAME(coll, op, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##Tree, op, dtype)
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -50,7 +53,7 @@
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// Must be consistent with ncclColl_t
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
@@ -59,7 +62,7 @@
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
@@ -0,0 +1,36 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef OP128_H_
#define OP128_H_
// Load two 64-bit values starting at `ptr` with one PTX
// ld.volatile.global.v2.u64 instruction; the results are returned through
// v0 (first element) and v1 (second element).
// NOTE(review): a .v2.u64 access presumably requires `ptr` to be 16-byte
// aligned and to refer to global memory — confirm against the PTX ISA.
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(ptr));
}
// Store the pair {v0, v1} at `ptr` with one PTX st.volatile.global.v2.u64
// instruction (v0 is the first element, v1 the second).
// NOTE(review): a .v2.u64 access presumably requires `ptr` to be 16-byte
// aligned and to refer to global memory — confirm against the PTX ISA.
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
:: "l"(v0), "l"(v1), "l"(ptr));
}
// Convert a generic pointer into a shared-state-space address using the PTX
// cvta.to.shared.u64 instruction and return it typed as uint64_t*.
// The returned value is what loadShmem128/storeShmem128 take as their
// `shmemAsmPtr` argument (they use ld/st with the .shared qualifier).
// NOTE(review): presumably the result must not be dereferenced as an ordinary
// C++ pointer — confirm.
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
uint64_t* shmemAsmPtr;
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
return shmemAsmPtr;
}
// Load two 64-bit values from the address `shmemAsmPtr` with one PTX
// ld.volatile.shared.v2.u64 instruction; results are returned through v0 and v1.
// NOTE(review): the .shared qualifier suggests `shmemAsmPtr` must be an address
// produced by shmemCvtPtr rather than a generic pointer — confirm.
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
}
// Store the pair {v0, v1} at the address `shmemAsmPtr` with one PTX
// st.volatile.shared.v2.u64 instruction.
// NOTE(review): the .shared qualifier suggests `shmemAsmPtr` must be an address
// produced by shmemCvtPtr rather than a generic pointer — confirm.
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
:: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
}
#endif
@@ -37,15 +37,27 @@ class ncclPrimitives {
private:
const int tid;
const int nthreads;
const int wid;
const int stepSize;
int nrecv = 0;
int nsend = 0;
const int stepSize;
struct ncclConnInfo* recvConn[NRECV];
struct ncclConnInfo* sendConn[NSEND];
volatile uint64_t* waitPtr;
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
volatile uint64_t* recvConnTailPtr = NULL;
uint64_t recvConnTail;
uint64_t recvConnTailCache; // Cache last seen value
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnTailPtr = NULL;
uint64_t sendConnTail;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
uint64_t sendConnHead[NSEND];
const T* recvDirectBuff[NRECV];
T* sendDirectBuff[NSEND];
const T* recvBuff[NRECV];
@@ -60,15 +72,18 @@ class ncclPrimitives {
// Synchronize on PTX named barrier 1 ("bar.sync 1, n"), passing `nthreads`
// as the barrier's thread-count operand.
inline __device__ void barrier() {
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
}
// Synchronize on PTX named barrier 2 ("bar.sync 2, n"), passing
// nthreads-WARP_SIZE as the thread-count operand, i.e. one warp fewer than
// barrier() uses.
inline __device__ void subBarrier() {
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE));
}
uint32_t mismatch = 0;
const uint64_t opCount;
inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
if (mismatch) {
// In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
*(comm->fatalDevError) = ncclDevAssertedMismatch;
} else if (remoteOpCount && *remoteOpCount > opCount) {
} else if (conn && *conn->opCountRem > opCount) {
mismatch += 1;
}
}
@@ -76,49 +91,55 @@ class ncclPrimitives {
uint32_t spins = 0;
uint32_t abort = 0;
inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
inline __device__ int checkAbort(int i, int send) {
spins++;
if (spins == SPINS_BEFORE_CHECK_ABORT) {
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
checkMismatch(remoteOpCount);
if (wid == i) checkMismatch(send ? sendConn : recvConn);
spins = 0;
}
return abort;
}
inline __device__ void waitRecv(int i) {
// Wait until the send side can take SLICESTEPS more steps, then record the
// slice size and advance the locally tracked head.
// Threads whose sendConnHeadPtr is NULL only reset their spin/mismatch counters.
inline __device__ void waitSend(int nbytes) {
  mismatch = 0;
  spins = 0;
  if (sendConnHeadPtr == NULL) return;

  // Re-read the head through the volatile pointer until
  // sendConnHead+SLICESTEPS is no more than NCCL_STEPS ahead of it, or until
  // checkAbort() reports an abort.
  for (;;) {
    const bool mustWait = sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS;
    if (!mustWait) break;
    sendConnHeadCache = *sendConnHeadPtr;
    if (checkAbort(wid, 1)) break;
  }

  // Publish the byte count for this step when a size FIFO is present.
  if (sendConnFifoPtr != NULL) {
    sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes;
  }
  sendConnHead += SLICESTEPS;
}
// Wait until the receive tail has advanced at least SLICESTEPS past the
// locally tracked tail, then advance the local tail by SLICESTEPS.
// Threads whose recvConnTailPtr is NULL only reset their spin/mismatch counters.
inline __device__ void waitRecv() {
  mismatch = 0;
  spins = 0;
  if (recvConnTailPtr == NULL) return;

  // Re-read the tail through the volatile pointer until enough steps have
  // arrived, or until checkAbort() reports an abort.
  for (;;) {
    const bool mustWait = recvConnTailCache < recvConnTail + SLICESTEPS;
    if (!mustWait) break;
    recvConnTailCache = *recvConnTailPtr;
    if (checkAbort(wid, 0)) break;
  }
  recvConnTail += SLICESTEPS;
}
inline __device__ void incRecv(int i) {
recvStep[i] += SLICESTEPS;
if (tid == i) {
while (*(waitPtr) < recvStep[i]) {
if (checkAbort(recvConn[i]->opCountRem)) break;
}
}
}
// Advance the locally tracked receive head by SLICESTEPS and write the new
// value through recvConnHeadPtr. Threads with a NULL recvConnHeadPtr do nothing.
inline __device__ void postRecv() {
  if (recvConnHeadPtr == NULL) return;
  recvConnHead += SLICESTEPS;
  *recvConnHeadPtr = recvConnHead;
}
inline __device__ void waitSend(int i) {
spins = 0;
mismatch = 0;
inline __device__ void incSend(int i) {
sendStep[i] += SLICESTEPS;
if (tid == WARP_SIZE+i) {
while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
sendConnHead[i] = *waitPtr;
if (checkAbort(sendConn[i]->opCountRem)) break;
}
}
}
inline __device__ void postRecv(int i) {
*(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
}
inline __device__ void postSend(int i) {
*(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
}
inline __device__ void postSendSize(int i, int size) {
if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
inline __device__ void postSend() {
if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS;
}
template <int DIRECTRECV>
@@ -131,11 +152,22 @@ class ncclPrimitives {
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
}
// Select the pointer increment for receive slot i: `directInc` when direct
// receive is enabled at compile time (DIRECTRECV) and a direct buffer is set
// for that slot, `sliceInc` otherwise.
template <int DIRECTRECV>
inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
  if (DIRECTRECV && recvDirectBuff[i]) {
    return directInc;
  }
  return sliceInc;
}
// Select the pointer increment for send slot i: `directInc` when direct send
// is enabled at compile time (DIRECTSEND) and a direct buffer is set for that
// slot, `sliceInc` otherwise.
template <int DIRECTSEND>
inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
  if (DIRECTSEND && sendDirectBuff[i]) {
    return directInc;
  }
  return sliceInc;
}
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
inline __device__ void
GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
int offset = 0;
int sliceSize = stepSize * SLICESTEPS;
int sliceSize = stepSize*SLICESTEPS;
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
const T* srcs[RECV*NRECV+SRC];
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
@@ -151,101 +183,126 @@ class ncclPrimitives {
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
}
#pragma unroll 1
bool syncThread = tid >= nthreads-WARP_SIZE;
#pragma unroll
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
int realSize = max(0, min(sliceSize, nelem-offset));
if (tid < nthreads) {
FOR_SEND(waitSend);
FOR_RECV(waitRecv);
int realSize = max(0, min(dataSize, nelem-offset));
if (!syncThread) {
if (SEND) waitSend(realSize*sizeof(T));
if (RECV) waitRecv();
if (realSize > 0) {
barrier();
subBarrier();
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize);
}
} else {
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
}
exitIfAbortBarrier(abort);
} else {
exitIfAbortBarrier(abort);
FOR_SEND(postSendSize, realSize*sizeof(T));
if (SEND) __threadfence_system();
FOR_SEND(postSend);
FOR_RECV(postRecv);
}
for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
offset += sliceSize;
barrier();
FOR_SEND(incSend);
FOR_RECV(incRecv);
if (syncThread) {
if (SEND) {
if (realSize > 0 && wid == 0) __threadfence_system();
__syncwarp();
postSend();
}
if (RECV) postRecv();
}
srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
offset += realSize;
}
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
recvConn[i] = conn;
recvBuff[i] = (const T*)recvConn[i]->buff;
recvStep[i] = recvConn[i]->step;
recvBuff[i] = (const T*)conn->buff;
recvStep[i] = conn->step;
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
// Return credits in case we rounded up.
if (tid == nthreads) *recvConn[i]->head = recvStep[i];
if (tid == i) {
waitPtr = recvConn[i]->tail;
*(recvConn[i]->opCountLoc) = opCount;
}
recvDirectBuff[i] = NULL;
if (directBuff && recvConn[i]->direct) {
if (directBuff && conn->direct) {
recvDirectBuff[i] = directBuff;
if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
if (tid == 0) *conn->ptrExchange = directBuff;
}
if (wid == i) recvConn = conn;
if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
nrecv++;
}
// Set up the per-thread receive synchronization state.
//  - Threads WARP_SIZE..2*WARP_SIZE-1 with wid < nrecv cache the tail pointer
//    and its current value (they are the ones that later poll it in waitRecv).
//  - Threads in the last warp (tid >= nthreads-WARP_SIZE) with wid < nrecv
//    cache the head pointer, publish the current head and the opCount.
__device__ __forceinline__ void loadRecvSync() {
  const bool hasRecvConn = wid < nrecv;
  const bool inSecondWarp = (tid >= WARP_SIZE) && (tid < 2*WARP_SIZE);
  const bool inLastWarp = tid >= nthreads-WARP_SIZE;

  if (inSecondWarp && hasRecvConn) {
    recvConnTailPtr = recvConn->tail;
    recvConnTailCache = *recvConnTailPtr;
  }

  if (inLastWarp && hasRecvConn) {
    recvConnHeadPtr = recvConn->head;
    // Return credits in case we rounded up.
    *recvConnHeadPtr = recvConnHead;
    // Update opCount in case we skipped some operations
    *(recvConn->opCountLoc) = opCount;
  }
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
sendConn[i] = conn;
sendBuff[i] = (T*)sendConn[i]->buff;
sendStep[i] = sendConn[i]->step;
sendBuff[i] = (T*)conn->buff;
sendStep[i] = conn->step;
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
if (tid == WARP_SIZE+i) {
waitPtr = sendConn[i]->head;
sendConnHead[i] = *waitPtr;
*(sendConn[i]->opCountLoc) = opCount;
}
sendDirectBuff[i] = NULL;
if (directBuff && sendConn[i]->direct) {
void* volatile* ptr = sendConn[i]->ptrExchange;
if (directBuff && conn->direct) {
void* volatile* ptr = conn->ptrExchange;
while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
__syncthreads();
barrier();
if (tid == 0) *ptr = NULL;
}
if (wid == i) sendConn = conn;
if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
nsend++;
}
__device__ __forceinline__ void saveRecvConn(int i) {
if (tid == i) {
recvConn[i]->step = recvStep[i];
__threadfence_system();
*(recvConn[i]->opCountLoc) += 1;
// Set up the per-thread send synchronization state.
//  - Threads with tid < nsend cache the head pointer, its current value and
//    the size FIFO pointer, and publish the opCount.
//  - Threads in the last warp (tid >= nthreads-WARP_SIZE) with wid < nsend
//    cache the tail pointer.
__device__ __forceinline__ void loadSendSync() {
  const bool waitsOnHead = tid < nsend;
  const bool postsTail = (tid >= nthreads-WARP_SIZE) && (wid < nsend);

  if (waitsOnHead) {
    sendConnHeadPtr = sendConn->head;
    sendConnHeadCache = *sendConnHeadPtr;
    sendConnFifoPtr = sendConn->fifo;
    *(sendConn->opCountLoc) = opCount;
  }

  if (postsTail) {
    sendConnTailPtr = sendConn->tail;
  }
}
__device__ __forceinline__ void saveSendConn(int i) {
if (tid == WARP_SIZE+i) {
sendConn[i]->step = sendStep[i];
// Persist the receive state at the end of the operation: threads in the last
// warp with wid < nrecv store their head as the connection step, publish
// opCount+1, then issue a system-wide memory fence. Other threads do nothing.
__device__ __forceinline__ void saveRecvSync() {
  const bool ownsRecvHead = (tid >= nthreads-WARP_SIZE) && (wid < nrecv);
  if (!ownsRecvHead) return;

  recvConn->step = recvConnHead;
  *(recvConn->opCountLoc) = opCount+1;
  __threadfence_system();
}
__device__ __forceinline__ void saveSendSync() {
if (tid < nsend) {
sendConn->step = sendConnHead;
*(sendConn->opCountLoc) = opCount+1;
__threadfence_system();
*(sendConn[i]->opCountLoc) += 1;
}
}
public:
__device__ __forceinline__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it
__syncthreads();
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
loadRecvSync();
loadSendSync();
}
__device__ __forceinline__ void
@@ -305,267 +362,13 @@ class ncclPrimitives {
}
__device__ __forceinline__ ~ncclPrimitives() {
// Save steps for next collective. Have thread 0 do it to be compatible
// with the way LL works.
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
}
};
template <typename T, class FUNC, int NRECV, int NSEND>
class ncclLLPrimitives {
private:
const int tid;
const int nthreads;
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn[NRECV];
struct ncclConnInfo* sendConn[NSEND];
volatile uint64_t* waitPtr;
volatile uint64_t* postPtr;
volatile int* fifoPtr;
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
uint64_t sendConnHead;
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
inline __device__ void exitIfAbortLocalBarrier() {
uint32_t popc;
asm ("{");
asm volatile (" .reg .pred barr_pred;");
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
asm ("}");
if (popc) {
// Make sure threads not participating in the operation get the abort and all threads exit
exitIfAbortBarrier(1);
}
}
inline __device__ void barrier() {
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
}
uint32_t mismatch = 0;
const uint64_t opCount;
inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
if (mismatch > 20) {
// We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
// Note that we are not using _threadfence_system in LL so the error cannot be asserted
*(comm->fatalDevError) = ncclDevSuspectedMismatch;
} else if (remoteOpCount && *remoteOpCount > opCount) {
mismatch += 1;
}
}
uint32_t spins = 0;
uint32_t abort = 0;
inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
spins++;
if (spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
checkMismatch(remoteOpCount);
spins = 0;
}
return abort;
}
inline __device__ void waitSend(int i, int nbytes) {
spins = 0;
mismatch = 0;
if (tid == WARP_SIZE+i) {
while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
sendConnHead = *waitPtr;
if (checkAbort(sendConn[i]->opCountRem)) break;
}
if (fifoPtr) {
int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
fifoPtr[sendStep[i]%NCCL_STEPS] = size;
}
}
}
inline __device__ void postRecv(int i) {
recvStep[i]++;
if (tid == i) *postPtr = recvStep[i];
}
inline __device__ void postSend(int i, int offset) {
// LL Cleanup : write all flags in the slice to make sure we don't have
// data corruption when flag loops over.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
__device__ uint64_t readLL(int i, int offset) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
spins = 0;
mismatch = 0;
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
if (checkAbort(recvConn[i]->opCountRem)) break;
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
return val64;
}
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
}
// Using memcpy handles misaligned pointers.
__device__ uint64_t readAL(uint64_t* src) {
uint64_t val;
memcpy((char*)&val, (char*)src, sizeof(uint64_t));
return val;
}
__device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
memcpy((char*)dst, (char*)&val, nbytes);
}
template <int RECV, int SEND, int SRC, int DST>
__device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
FOR_SEND(waitSend, nbytes*2);
barrier();
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
uint64_t* srcPack = (uint64_t*)srcPtr;
uint64_t* dstPack = (uint64_t*)dstPtr;
int offset = tid;
// Do multiples of 64 bits
#pragma unroll 2
for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
if (RECV) {
if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
for (int i=1; i<NRECV && i<nrecv; i++) {
val = MULTI<FUNC, T>()(readLL(i, offset), val);
}
}
// Send : inter-node, then intra-node, then local
if (SEND) {
for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
storeLL(sendPtr(0)+offset, val, sendFlag(0));
}
if (DST) {
if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
// Last incomplete word
storeAL(dstPack+offset, val, nbytes & 0x7);
} else {
storeAL(dstPack+offset, val, sizeof(uint64_t));
}
}
}
exitIfAbortLocalBarrier();
FOR_RECV(postRecv);
FOR_SEND(postSend, offset);
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
recvConn[i] = conn;
recvBuff[i] = recvConn[i]->llBuff;
recvStep[i] = recvConn[i]->step;
if (tid == i) {
postPtr = recvConn[i]->head;
*(recvConn[i]->opCountLoc) = opCount;
}
nrecv++;
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendConn[i] = conn;
sendBuff[i] = sendConn[i]->llBuff;
sendStep[i] = sendConn[i]->step;
if (tid == WARP_SIZE+i) {
waitPtr = sendConn[i]->head;
fifoPtr = sendConn[i]->fifo;
sendConnHead = *waitPtr;
*(sendConn[i]->opCountLoc) = opCount;
}
nsend++;
}
__device__ __forceinline__ void saveRecvConn(int i) {
if (tid == i) {
recvConn[i]->step = recvStep[i];
*(recvConn[i]->opCountLoc) += 1;
__threadfence_block();
}
}
__device__ __forceinline__ void saveSendConn(int i) {
if (tid == WARP_SIZE+i) {
sendConn[i]->step = sendStep[i];
*(sendConn[i]->opCountLoc) += 1;
__threadfence_block();
}
}
public:
__device__ __forceinline__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
}
__device__ void send(const T* src, int nelem) {
return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
}
__device__ void recv(T* dst, int nelem) {
return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}
__device__ void recvReduceSend(const T* src, int nelem) {
return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
}
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
}
__device__ void copySend(const T* src, T* dst, int nelem) {
return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
}
__device__ void recvCopySend(T* dst, int nelem) {
return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
}
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
}
__device__ __forceinline__ ~ncclLLPrimitives() {
// Save steps for the next operation
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
saveRecvSync();
saveSendSync();
}
};
#include "prims_ll.h"
//#include "prims_ll128.h"
#endif
@@ -0,0 +1,259 @@
template <typename T, class FUNC, int NRECV, int NSEND>
class ncclLLPrimitives {
private:
const int tid;
const int nthreads;
const int wid;
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclDevComm* comm;
// Line offset of the current receive slot for peer i: the step modulo
// NCCL_STEPS selects the slot, each slot being NCCL_LL_SLICE_LINES lines long.
inline __device__ int recvOffset(int i) {
  const uint64_t slot = recvStep[i]%NCCL_STEPS;
  return slot*NCCL_LL_SLICE_LINES;
}
// Line offset of the current send slot for peer i: the step modulo
// NCCL_STEPS selects the slot, each slot being NCCL_LL_SLICE_LINES lines long.
inline __device__ int sendOffset(int i) {
  const uint64_t slot = sendStep[i]%NCCL_STEPS;
  return slot*NCCL_LL_SLICE_LINES;
}
// Pointer to the first line of the current receive slot for peer i.
inline __device__ union ncclLLFifoLine* recvPtr(int i) {
  union ncclLLFifoLine* base = recvBuff[i];
  return base + recvOffset(i);
}
// Pointer to the first line of the current send slot for peer i.
inline __device__ union ncclLLFifoLine* sendPtr(int i) {
  union ncclLLFifoLine* base = sendBuff[i];
  return base + sendOffset(i);
}
// Flag value expected in lines received from peer i during the current step
// (derived from the step number plus one via NCCL_LL_FLAG).
inline __device__ uint32_t recvFlag(int i) {
  const uint64_t stepPlusOne = recvStep[i]+1;
  return NCCL_LL_FLAG(stepPlusOne);
}
// Flag value written into lines sent to peer i during the current step
// (derived from the step number plus one via NCCL_LL_FLAG).
inline __device__ uint32_t sendFlag(int i) {
  const uint64_t stepPlusOne = sendStep[i]+1;
  return NCCL_LL_FLAG(stepPlusOne);
}
// Synchronize on PTX named barrier 1 ("bar.sync 1, n"), passing `nthreads`
// as the barrier's thread-count operand.
inline __device__ void barrier() {
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
}
uint32_t mismatch = 0;
const uint64_t opCount;
// Heuristic detection of mismatched collectives on connection `conn`.
// Each call on which the remote opCount is seen ahead of ours bumps the
// `mismatch` counter; once the counter exceeds 20 a suspected-mismatch error
// is written to comm->fatalDevError instead.
inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
  if (mismatch > 20) {
    // The peer has been seen ahead of us many times while we are still
    // waiting for credit on the current op, so it is _most likely_ a mismatch.
    // LL does not use __threadfence_system, so the error cannot be asserted.
    *(comm->fatalDevError) = ncclDevSuspectedMismatch;
    return;
  }
  if (conn && *conn->opCountRem > opCount) {
    mismatch += 1;
  }
}
uint32_t spins = 0;
uint32_t abort = 0;
// Called from spin loops. Counts spins and, every SPINS_BEFORE_CHECK_ABORT
// spins (as long as no abort has been seen yet), reloads the abort flag and
// lets the thread with wid == i run the mismatch check on its send or receive
// connection (`send` selects which). Returns the current abort value.
inline __device__ int checkAbort(int i, int send) {
  spins++;
  const bool timeToPoll = (abort == 0) && (spins == SPINS_BEFORE_CHECK_ABORT);
  if (timeToPoll) {
    abort = *(comm->abortFlag);
    if (wid == i) {
      checkMismatch(send ? sendConn : recvConn);
    }
    spins = 0;
  }
  return abort;
}
// Wait for room for one more step on the send side, record the size of the
// step in the size FIFO (if any), advance the local head, then synchronize
// all threads on barrier(). Threads whose sendConnHeadPtr is NULL only reset
// their counters and join the barrier.
inline __device__ void waitSend(int nbytes) {
  mismatch = 0;
  spins = 0;
  if (sendConnHeadPtr != NULL) {
    // Re-read the head through the volatile pointer until sendConnHead+1 is
    // no more than NCCL_STEPS ahead of it, or until an abort is reported.
    for (;;) {
      const bool mustWait = sendConnHeadCache + NCCL_STEPS < sendConnHead + 1;
      if (!mustWait) break;
      sendConnHeadCache = *sendConnHeadPtr;
      if (checkAbort(wid, 1)) break;
    }
    if (sendConnFifoPtr != NULL) {
      // On cleanup steps the whole slice is written, so report the full slice size.
      int size = nbytes;
      if ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
        size = NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine);
      }
      sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
    }
    sendConnHead += 1;
  }
  barrier();
}
// Advance the receive step counter of peer i by one.
inline __device__ void incRecv(int i) {
  ++recvStep[i];
}
// Synchronize all threads on barrier(), then — on threads that hold a head
// pointer — advance the local receive head by one and write it through
// recvConnHeadPtr.
inline __device__ void postRecv() {
  barrier();
  if (recvConnHeadPtr == NULL) return;
  recvConnHead += 1;
  *recvConnHeadPtr = recvConnHead;
}
// Advance the send step counter of peer i by one.
// On cleanup steps (all NCCL_LL_CLEAN_MASK bits set in the step) the lines of
// the slice from `offset` upwards, in strides of nthreads, are first written
// with zero data and the current flag, to make sure we don't have data
// corruption when the flag loops over.
inline __device__ void incSend(int i, int offset) {
  const bool cleanupStep = (sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK;
  if (cleanupStep) {
    int line = offset;
    while (line < NCCL_LL_SLICE_LINES) {
      storeLL(sendPtr(i)+line, 0, sendFlag(i));
      line += nthreads;
    }
  }
  sendStep[i]++;
}
// Read one LL FIFO line from receive peer i at line index `offset` of the
// current slot, spinning until the line carries the expected flag.
// Each iteration loads the 16-byte line with a single volatile v4.u32 load as
// {data1, flag1, data2, flag2}; the loop ends when both flags equal
// recvFlag(i), or early when checkAbort() reports an abort.
// Returns the 64-bit value with data1 in the low and data2 in the high 32 bits.
// Note: after an early exit on abort, the returned value is built from
// whatever the last load produced.
__device__ uint64_t readLL(int i, int offset) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
// Restart the spin and mismatch counters for this wait.
spins = 0;
mismatch = 0;
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
if (checkAbort(i, 0)) break;
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
return val64;
}
// Write one LL FIFO line at `dst` with a single volatile v4.u32 store laid out
// as {low 32 bits of val, flag, high 32 bits of val, flag} — the same layout
// readLL expects.
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
}
// Load 64 bits from src. The copy goes through memcpy so that src does not
// have to be 8-byte aligned.
__device__ uint64_t readAL(uint64_t* src) {
  uint64_t word;
  memcpy(&word, src, sizeof(word));
  return word;
}
// Store the first nbytes bytes of val at dst (memcpy, so dst may be
// misaligned and a partial trailing word can be written).
__device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
  const uint64_t word = val;
  memcpy(dst, &word, nbytes);
}
// Generic LL step shared by all public operations.
// Template flags select which stages run:
//   RECV : read from the receive peer(s) and post the step afterwards
//   SEND : wait for send credit and write to the send peer(s)
//   SRC  : read the local input buffer srcPtr
//   DST  : write the local output buffer dstPtr
// nelem is the element count; a negative nelem is treated as 0 bytes, in which
// case no data moves but the step bookkeeping below still runs.
// Data is processed in 64-bit packs, thread t handling packs t, t+nthreads, ...
template <int RECV, int SEND, int SRC, int DST>
__device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
uint64_t* srcPack = (uint64_t*)srcPtr;
uint64_t* dstPack = (uint64_t*)dstPtr;
int offset = tid;
// Always waitSend in case of cleanup
if (SEND) waitSend(npack*sizeof(union ncclLLFifoLine));
// Do multiples of 64 bits
#pragma unroll 2
for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
// Without SRC the first value comes straight from receive peer 0.
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
if (RECV) {
// With SRC, peer 0 still has to be folded into the local value.
if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
for (int i=1; i<NRECV && i<nrecv; i++) {
val = MULTI<FUNC, T>()(readLL(i, offset), val);
}
}
// Send : inter-node, then intra-node, then local
if (SEND) {
for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
storeLL(sendPtr(0)+offset, val, sendFlag(0));
}
if (DST) {
// True only when this pack starts within the last 8 bytes of the buffer and
// nbytes is not a multiple of 8, i.e. the pack is the partial trailing word.
if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
// Last incomplete word
storeAL(dstPack+offset, val, nbytes & 0x7);
} else {
storeAL(dstPack+offset, val, sizeof(uint64_t));
}
}
}
// FOR_RECV / FOR_SEND are defined outside this chunk; presumably they apply
// the given function to every active peer index — confirm at the definition.
FOR_RECV(incRecv); if (RECV) postRecv();
FOR_SEND(incSend, offset);
}
// Register receive connection number i: remember its LL buffer and current
// step, let the thread whose wid equals i keep the connection pointer, and
// count the connection.
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
  if (wid == i) {
    recvConn = conn;
  }
  recvStep[i] = conn->step;
  recvBuff[i] = conn->llBuff;
  nrecv += 1;
}
// Sets up the receive-side synchronization state. Runs only on threads with
// tid >= nthreads-WARP_SIZE whose wid is below nrecv; each such thread caches
// the head pointer and the current step of its recv connection.
__device__ __forceinline__ void loadRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
// Update opCount in case we skipped some operations
*(recvConn->opCountLoc) = opCount;
}
}
// Register send connection number i: remember its LL buffer and current step,
// let the thread whose wid equals i keep the connection pointer, and count
// the connection.
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
  if (wid == i) {
    sendConn = conn;
  }
  sendStep[i] = conn->step;
  sendBuff[i] = conn->llBuff;
  nsend += 1;
}
// Sets up the send-side synchronization state on threads with tid < nsend:
// caches the head pointer, an initial read of the head value, the current
// step and the FIFO pointer of the thread's send connection, and records the
// current opCount in the connection.
__device__ __forceinline__ void loadSendSync() {
if (tid < nsend) {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
sendConnFifoPtr = sendConn->fifo;
*(sendConn->opCountLoc) = opCount;
}
}
// Writes the receive-side state back to the connection (same thread selection
// as loadRecvSync): stores the final head counter as the connection's step
// and sets the local opCount to opCount+1, followed by a block-level fence.
__device__ __forceinline__ void saveRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConn->step = recvConnHead;
*(recvConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
// Writes the send-side state back to the connection (threads with
// tid < nsend): stores the final head counter as the connection's step and
// sets the local opCount to opCount+1, followed by a block-level fence.
__device__ __forceinline__ void saveSendSync() {
if (tid < nsend) {
sendConn->step = sendConnHead;
*(sendConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
public:
// Builds the primitives object for one operation.
//   tid, nthreads : index of the calling thread and number of participating threads
//   recvPeers     : peer indices to receive from; scanning stops at NRECV entries
//                   or at the first negative entry
//   sendPeers     : peer indices to send to; same termination rule with NSEND
//   channel, comm : channel holding the per-peer connections, and the communicator
//   opCount       : operation counter recorded in the connections
// After a barrier, the connections are registered and the per-thread
// synchronization state is loaded.
__device__ __forceinline__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
loadRecvSync();
loadSendSync();
}
// Public operations. Each one is LLGenericOp<RECV, SEND, SRC, DST> with the
// stages its name spells out; unused buffer arguments are passed as NULL.

// Local input -> send peers.
__device__ void send(const T* src, int nelem) {
  LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
}

// Receive peers -> local output.
__device__ void recv(T* dst, int nelem) {
  LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}

// Reduce received data with local input -> send peers.
__device__ void recvReduceSend(const T* src, int nelem) {
  LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
}

// Reduce received data with local input -> local output.
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
  LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
}

// Local input -> local output and send peers.
__device__ void copySend(const T* src, T* dst, int nelem) {
  LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
}

// Received data -> local output and send peers.
__device__ void recvCopySend(T* dst, int nelem) {
  LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
}

// Reduce received data with local input -> local output and send peers.
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
  LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
}
// On destruction, writes the receive and send step counters (and opCount+1)
// back to the connections via saveRecvSync()/saveSendSync().
__device__ __forceinline__ ~ncclLLPrimitives() {
// Save steps for the next operation
saveRecvSync();
saveSendSync();
}
};
@@ -0,0 +1,410 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "op128.h"
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
template <typename T, class FUNC, int NRECV, int NSEND>
class ncclLL128Primitives {
private:
const int tid;
const int nthreads;
const int wid;
const int warp;
const bool flagThread;
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnTailPtr = NULL;
uint64_t sendConnTail;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
uint64_t* recvBuff[NRECV];
uint64_t* sendBuff[NSEND];
struct ncclDevComm* comm;
volatile uint64_t* shmem;
// Offset (in 64-bit elements) of the buffer slot used by the current step of
// receive peer i; slots are reused every NCCL_STEPS steps.
inline __device__ int recvOffset(int i) {
  const uint64_t slot = recvStep[i] % NCCL_STEPS;
  return slot * NCCL_LL128_SLICE_ELEMS;
}
// Same as recvOffset(), for send peer i.
inline __device__ int sendOffset(int i) {
  const uint64_t slot = sendStep[i] % NCCL_STEPS;
  return slot * NCCL_LL128_SLICE_ELEMS;
}
// Start of the current slot in receive peer i's buffer.
inline __device__ uint64_t* recvPtr(int i) {
  return recvBuff[i] + recvOffset(i);
}
// Start of the current slot in send peer i's buffer.
inline __device__ uint64_t* sendPtr(int i) {
  return sendBuff[i] + sendOffset(i);
}
// Flag value expected in data received for the current step: step + 1.
inline __device__ uint64_t recvFlag(int i) {
  return recvStep[i] + 1;
}
// Flag value written with data sent for the current step: step + 1.
inline __device__ uint64_t sendFlag(int i) {
  return sendStep[i] + 1;
}
// Named-barrier synchronization over nthreads threads. Barrier id 2 is used
// when the instantiation has more send than receive peers (NSEND>NRECV),
// barrier id 3 otherwise.
// NOTE(review): presumably the two ids keep differently-shaped primitive
// instances in the same block from sharing a barrier — confirm with callers.
inline __device__ void barrier() {
if (NSEND>NRECV) {
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
} else {
asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
}
}
uint32_t mismatch = 0;
const uint64_t opCount;
// Heuristic detection of mismatched collectives, called from checkAbort().
// Each call on which the value read through conn->opCountRem is larger than
// our opCount increments `mismatch`; once the counter exceeds 20, the
// communicator's fatalDevError is set to ncclDevSuspectedMismatch.
// conn may be NULL, in which case the counter is not incremented.
inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
if (mismatch > 20) {
// We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
// Note that we are not using _threadfence_system in LL so the error cannot be asserted
*(comm->fatalDevError) = ncclDevSuspectedMismatch;
} else if (conn && *conn->opCountRem > opCount) {
mismatch += 1;
}
}
// Polling-iteration counter for checkAbort().
uint32_t spins = 0;
// Last value read from *(comm->abortFlag); sticky once non-zero.
uint32_t abort = 0;
// Call once per polling iteration. After SPINS_BEFORE_CHECK_ABORT calls with
// no abort seen so far, refresh the abort flag, let the thread with wid == i
// run the mismatch check on its send (send != 0) or recv connection, and
// restart the count. Returns the cached abort value.
inline __device__ int checkAbort(int i, int send) {
  spins++;
  const bool timeToCheck = (abort == 0) && (spins == SPINS_BEFORE_CHECK_ABORT);
  if (!timeToCheck) return abort;
  abort = *(comm->abortFlag);
  if (wid == i) {
    checkMismatch(send ? sendConn : recvConn);
  }
  spins = 0;
  return abort;
}
// Waits until there is room to send one more step and publishes its size.
// Only threads holding a send head pointer (set in loadSendSync for
// tid < nsend) do any work; unlike the LL variant there is no barrier here,
// the caller synchronizes afterwards.
//   nbytes : size written to the FIFO entry of this step (if a FIFO exists)
inline __device__ void waitSend(int nbytes) {
spins = 0;
mismatch = 0;
if (sendConnHeadPtr) {
// Spin while sendConnHead is NCCL_STEPS or more ahead of the last head
// value read through sendConnHeadPtr; give up if an abort is flagged.
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
sendConnHeadCache = *sendConnHeadPtr;
if (checkAbort(wid, 1)) break;
}
if (sendConnFifoPtr) {
// The FIFO slot is selected with this thread's own connection step
// (sendStep[wid]).
sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes;
}
sendConnHead += 1;
}
}
// Advance the local step counter of receive peer i.
inline __device__ void incRecv(int i) {
  ++recvStep[i];
}
// Threads holding a recv head pointer bump their head counter and publish it.
inline __device__ void postRecv() {
  if (recvConnHeadPtr == NULL) return;
  recvConnHead += 1;
  *recvConnHeadPtr = recvConnHead;
}
// Advance the local step counter of send peer i.
inline __device__ void incSend(int i) {
  ++sendStep[i];
}
// Threads holding a send tail pointer issue a memory fence, then bump their
// tail counter and publish it.
inline __device__ void postSend() {
  if (sendConnTailPtr == NULL) return;
  __threadfence();
  sendConnTail += 1;
  *sendConnTailPtr = sendConnTail;
}
// Copies this thread's share of the source into the staging area addressed by
// `shmem`, two 64-bit words (128 bits) at a time.
//   maxOffset : pairs whose offset u*WARP_SIZE is not below maxOffset are skipped
//   src64Ptr  : source pointer already advanced to this thread's first pair
// The `#if 0` branch is compiled out; it is an alternative that first loads
// every pair into registers and then stores them all. Only the `#else` branch
// (load one pair, store it immediately) is active.
template <int ELEMS_PER_THREAD>
inline __device__ void loadSrcToShmem128(int maxOffset, const uint64_t* src64Ptr) {
#if 0
uint64_t v[ELEMS_PER_THREAD];
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
if (u*WARP_SIZE < maxOffset) load128(src64Ptr+u*WARP_SIZE, v[u], v[u+1]);
}
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
}
#else
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
if (u*WARP_SIZE < maxOffset) {
uint64_t v0, v1;
load128(src64Ptr+u*WARP_SIZE, v0, v1);
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
}
}
#endif
}
// Element-wise copy of srcPtr[start..end) into the warp's staging area; the
// lanes of the warp take elements start+wid, start+wid+WARP_SIZE, ...
// `shmem` was offset by 2*wid in the constructor, so shmem-2*wid is the start
// of the area shared by the whole warp.
inline __device__ void loadSrcToShmem(int start, int end, const T* srcPtr) {
T* shmemPtr = (T*)(shmem-2*wid);
for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
shmemPtr[offset] = srcPtr[offset];
}
}
// Copies this thread's share of the staging area addressed by `shmem` to the
// destination, 128 bits at a time. All pairs are first loaded into registers
// (unconditionally), then only the pairs whose offset u*WARP_SIZE is below
// maxOffset are stored.
//   maxOffset : bound on the per-thread offsets that are written
//   dst64Ptr  : destination pointer already advanced to this thread's first pair
template <int ELEMS_PER_THREAD>
inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
uint64_t v[ELEMS_PER_THREAD];
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
}
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
}
}
// Element-wise copy of the warp's staging area into dstPtr[start..end); the
// lanes of the warp take elements start+wid, start+wid+WARP_SIZE, ...
// shmem-2*wid is the start of the warp's area (see the constructor).
inline __device__ void storeShmemToDst(int start, int end, T* dstPtr) {
T* shmemPtr = (T*)(shmem-2*wid);
for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
dstPtr[offset] = shmemPtr[offset];
}
}
#define WARP_MASK 0xffffffff
// Core per-warp LL128 step: optionally loads local data from the staging
// area, receives and reduces data from the receive peer(s), sends the result
// to the send peer(s) and/or writes it back to the staging area.
//   ELEMS_PER_THREAD      : number of 64-bit words held per thread (handled in pairs)
//   RECV, SEND, SRC, DST  : compile-time switches for the four stages
//   ll128Offset           : this thread's element offset inside the current slot
// On flag threads (flagThread == true) the second word of every 128-bit pair
// carries the step flag instead of data.
template <int ELEMS_PER_THREAD, int RECV, int SEND, int SRC, int DST>
__device__ __forceinline__ void recvReduceSendCopy(int ll128Offset) {
  uint64_t v[ELEMS_PER_THREAD];

  /************* Data Loading : SHMEM -> REG **************/
  if (SRC) {
    volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
      v[u] = shmem64Ptr[u*(WARP_SIZE-2)];
      // Flag threads have no second data word.
      if (!flagThread) v[u+1] = shmem64Ptr[u*(WARP_SIZE-2)+1];
    }
  }
  /*********** End Data Loading : SHMEM -> REG ************/

  /************************ Recv **************************/
  if (RECV) {
    uint64_t flag = recvFlag(0);
    uint64_t* ptr = recvPtr(0)+ll128Offset;
    bool needReload;
    uint64_t v0, v1;
    // Poll until every flag thread of the warp sees the expected flag in all
    // of its pairs, or until an abort is reported.
    do {
      needReload = false;
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
        load128(ptr+u*WARP_SIZE, v0, v1);
        needReload |= flagThread && (v1 != flag);
      }
    } while (__any_sync(WARP_MASK, needReload) && checkAbort(0, 0) == 0);
    // Data is loaded again after the flags have been validated.
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
      load128(ptr+u*WARP_SIZE, v0, v1);
      v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
      v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
    }

    for (int i=1; i<NRECV && i<nrecv; i++) {
      uint64_t flag = recvFlag(i);
      uint64_t* ptr = recvPtr(i)+ll128Offset;
      uint64_t v0, v1;
      do {
        needReload = false;
        #pragma unroll
        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
          load128(ptr+u*WARP_SIZE, v0, v1);
          needReload |= flagThread && (v1 != flag);
        }
      } while (__any_sync(WARP_MASK, needReload) && checkAbort(i, 0) == 0);
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
        load128(ptr+u*WARP_SIZE, v0, v1);
        v[u] = MULTI<FUNC, T>()(v0, v[u]);
        v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
      }
    }
  }
  /********************** End Recv ************************/

  /************************ Send **************************/
  if (SEND) {
    for (int i=1; i<NSEND && i<nsend; i++) {
      // Keep the flag 64-bit wide: sendFlag() returns uint64_t and the
      // receiver compares against a uint64_t recvFlag(). Holding it in an int
      // would truncate/sign-extend the value once step+1 exceeds INT_MAX and
      // the receiver would never see a matching flag.
      uint64_t flag = sendFlag(i);
      uint64_t* ptr = sendPtr(i)+ll128Offset;
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
        store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
      }
    }
    uint64_t flag = sendFlag(0);
    uint64_t* ptr = sendPtr(0)+ll128Offset;
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
      store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
    }
  }
  /********************** End Send ************************/

  /************* Data Storing : REG -> SHMEM **************/
  if (DST) {
    volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
      shmem64Ptr[u*(WARP_SIZE-2)] = v[u];
      if (!flagThread) shmem64Ptr[u*(WARP_SIZE-2)+1] = v[u+1];
    }
  }
  /*********** End data Storing : REG -> SHMEM ************/
}
#define LL128INC (WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD)
#define ELEMINC (LL128INC-(LL128INC/NCCL_LL128_LINEELEMS))
// Generic LL128 step shared by all public operations.
// Template flags: RECV/SEND enable the network stages, SRC/DST enable reading
// srcPtr / writing dstPtr. nelem is the number of T elements to process.
// Work is split per warp: each warp handles ELEMINC 64-bit data words
// (LL128INC words including flags) per iteration and strides by the number of
// warps.
template <int RECV, int SEND, int SRC, int DST>
__device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem) {
if (nelem <= 0) {
// Don't move any data but still increase steps and sync with prev/next
if (SEND) waitSend(0);
FOR_SEND(incSend); if (SEND) postSend();
FOR_RECV(incRecv); if (RECV) postRecv();
return;
}
// Number of 64-bit words covered by whole 16-byte pairs (rounded down to even).
const int nelem64 = ((nelem*sizeof(T))/(2*sizeof(uint64_t)))*2;
const uint64_t* src64Ptr = ((uint64_t*)srcPtr);
uint64_t* dst64Ptr = ((uint64_t*)dstPtr);
// Per-thread offset in the LL128 buffer and per-warp offset in user data.
int ll128Offset = LL128INC*warp+2*wid;
int elemOffset = ELEMINC*warp;
const int nwarps = nthreads/WARP_SIZE;
// The advertised size counts LL128INC words for every (possibly partial)
// group of ELEMINC data words.
if (SEND) waitSend(DIVUP(nelem*sizeof(T), ELEMINC*sizeof(uint64_t))*LL128INC*sizeof(uint64_t));
barrier();
while (elemOffset*(sizeof(uint64_t)/sizeof(T)) < nelem) {
// Remaining work for this warp in this iteration, in 64-bit words
// (128-bit path) and in T elements (element-wise path).
const int maxOffset128 = min(nelem64-elemOffset, (int)ELEMINC);
const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
if (SRC) {
int done = 0;
// The 128-bit path is used only when srcPtr is 16-byte aligned; elements
// not covered by it (from `done` up to maxOffset) are copied one by one.
if ((((uint64_t)srcPtr)&0xf) == 0) {
loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
}
loadSrcToShmem(done, maxOffset, (T*)(src64Ptr+elemOffset));
}
__syncwarp();
recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SRC, DST>(ll128Offset);
__syncwarp();
if (DST) {
int done = 0;
// Same split as for the source: 128-bit stores if dstPtr is 16-byte
// aligned, element-wise stores for the rest.
if ((((uint64_t)dstPtr)&0xf) == 0) {
storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
}
storeShmemToDst(done, maxOffset, (T*)(dst64Ptr+elemOffset));
}
__syncwarp();
ll128Offset += LL128INC*nwarps;
elemOffset += ELEMINC*nwarps;
}
barrier();
// FOR_SEND / FOR_RECV are defined outside this chunk; presumably they apply
// the given function to every active peer index — confirm at the definition.
FOR_SEND(incSend); if (SEND) postSend();
FOR_RECV(incRecv); if (RECV) postRecv();
}
// Register receive connection number i: remember its LL128 buffer and current
// step, let the thread whose wid equals i keep the connection pointer, and
// count the connection.
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
  if (wid == i) {
    recvConn = conn;
  }
  recvStep[i] = conn->step;
  recvBuff[i] = conn->ll128Buff;
  nrecv += 1;
}
// Sets up the receive-side synchronization state. Runs only on threads with
// tid >= nthreads-WARP_SIZE whose wid is below nrecv; each such thread caches
// the head pointer and the current step of its recv connection.
__device__ __forceinline__ void loadRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
// Update opCount in case we skipped some operations
*(recvConn->opCountLoc) = opCount;
}
}
// Register send connection number i: remember its LL128 buffer and current
// step, let the thread whose wid equals i keep the connection pointer, and
// count the connection.
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
  if (wid == i) {
    sendConn = conn;
  }
  sendStep[i] = conn->step;
  sendBuff[i] = conn->ll128Buff;
  nsend += 1;
}
// Sets up the send-side synchronization state.
// Threads with tid < nsend cache the head pointer, an initial head value, the
// current step and the FIFO pointer of their send connection and record the
// current opCount. Threads with tid >= nthreads-WARP_SIZE and wid < nsend
// additionally cache the tail pointer and step, but only when the connection
// has a FIFO; postSend() is a no-op on all other threads.
__device__ __forceinline__ void loadSendSync() {
if (tid < nsend) {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
sendConnFifoPtr = sendConn->fifo;
*(sendConn->opCountLoc) = opCount;
}
if (tid >= nthreads-WARP_SIZE && wid<nsend) {
if (sendConn->fifo) {
sendConnTailPtr = sendConn->tail;
sendConnTail = sendConn->step;
}
}
}
// Writes the receive-side state back to the connection (same thread selection
// as loadRecvSync): stores the final head counter as the connection's step
// and sets the local opCount to opCount+1, followed by a block-level fence.
__device__ __forceinline__ void saveRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConn->step = recvConnHead;
*(recvConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
// Writes the send-side state back to the connection (threads with
// tid < nsend): stores the final head counter as the connection's step and
// sets the local opCount to opCount+1, followed by a block-level fence.
__device__ __forceinline__ void saveSendSync() {
if (tid < nsend) {
sendConn->step = sendConnHead;
*(sendConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
public:
// Builds the LL128 primitives object for one operation.
//   tid, nthreads : index of the calling thread and number of participating threads
//   recvPeers     : peer indices to receive from; scanning stops at NRECV entries
//                   or at the first negative entry
//   sendPeers     : peer indices to send to; same termination rule with NSEND
//   channel, comm : channel holding the per-peer connections, and the communicator
//   opCount       : operation counter recorded in the connections
// Derived per-thread state: wid = tid%WARP_SIZE, warp = tid/WARP_SIZE,
// flagThread is true for every 8th thread (tid%8 == 7), and shmem points into
// ncclShmem at this warp's region (indexed with threadIdx.x/WARP_SIZE) plus
// 2*wid.
__device__ __forceinline__
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
loadRecvSync();
loadSendSync();
}
// Public operations. Each one is GenericOp<RECV, SEND, SRC, DST> with the
// stages its name spells out; unused buffer arguments are passed as NULL.

// Local input -> send peers.
__device__ void send(const T* src, int nelem) {
  GenericOp<0, 1, 1, 0>(src, NULL, nelem);
}

// Receive peers -> local output.
__device__ void recv(T* dst, int nelem) {
  GenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}

// Reduce received data with local input -> send peers.
__device__ void recvReduceSend(const T* src, int nelem) {
  GenericOp<1, 1, 1, 0>(src, NULL, nelem);
}

// Reduce received data with local input -> local output.
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
  GenericOp<1, 0, 1, 1>(src, dst, nelem);
}

// Local input -> local output and send peers.
__device__ void copySend(const T* src, T* dst, int nelem) {
  GenericOp<0, 1, 1, 1>(src, dst, nelem);
}

// Received data -> local output and send peers.
__device__ void recvCopySend(T* dst, int nelem) {
  GenericOp<1, 1, 0, 1>(NULL, dst, nelem);
}

// Reduce received data with local input -> local output and send peers.
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
  GenericOp<1, 1, 1, 1>(src, dst, nelem);
}
// On destruction, writes the receive and send step counters (and opCount+1)
// back to the connections via saveRecvSync()/saveSendSync().
__device__ __forceinline__ ~ncclLL128Primitives() {
// Save steps for the next operation
saveRecvSync();
saveSendSync();
}
};
@@ -11,7 +11,7 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -30,7 +30,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -93,3 +93,48 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
// Reduce, tree algorithm, LL protocol: intentionally empty, no implementation
// is provided for this combination.
// NOTE(review): presumably kept so that every algorithm/protocol combination
// has a kernel symbol — confirm against the kernel tables.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
#include "prims_ll128.h"
// Reduce over the ring using the LL128 protocol.
// Data flows along the ring towards the root: the rank whose ring predecessor
// (devUserRanks[nranks-1]) is the root only sends its input, the root reduces
// what it receives with its input into its output, and every other rank
// reduces what it receives with its input and forwards the result.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
  const int nthreads = args->nThreads;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
  struct ncclRing* ring = &channel->ring;

  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

  const ssize_t size = args->N;
  const int rank = comm->rank;
  const int nranks = comm->nRanks;
  const int prevRank = ring->devUserRanks[nranks-1];
  const int root = args->root;

  // Largest chunk one channel handles per loop iteration, and the granularity
  // a shrunk chunk is rounded to (both in elements of T).
  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
  const ssize_t loopSize = args->nChannels*chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  const bool sendsOnly = (prevRank == root);
  const bool isRoot = (rank == root);

  ssize_t gridOffset = 0;
  while (gridOffset < size) {
    // Shrink the chunk when the remaining data does not fill all channels.
    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
    const ssize_t offset = gridOffset + bid*chunkSize;
    const int nelem = min(chunkSize, size-offset);

    if (sendsOnly) {
      LLprims.send(thisInput+offset, nelem);
    } else if (isRoot) {
      LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
    } else {
      LLprims.recvReduceSend(thisInput+offset, nelem);
    }

    gridOffset += loopSize;
  }
}
// Reduce, tree algorithm, LL128 protocol: intentionally empty, no
// implementation is provided for this combination.
// NOTE(review): presumably kept so that every algorithm/protocol combination
// has a kernel symbol — confirm against the kernel tables.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
@@ -11,7 +11,7 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -19,7 +19,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
@@ -27,7 +27,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -121,3 +121,64 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
// ReduceScatter, tree algorithm, LL protocol: intentionally empty, no
// implementation is provided for this combination.
// NOTE(review): presumably kept so that every algorithm/protocol combination
// has a kernel symbol — confirm against the kernel tables.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
#include "prims_ll128.h"
// ReduceScatter over the ring using the LL128 protocol.
// For every chunk the block performs nranks ring steps: one send, nranks-2
// recvReduceSend steps and a final recvReduceCopy that writes the result for
// this rank into the output buffer. The input slice used at each step is
// selected through ring->devUserRanks, walking the ring backwards.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
// Largest chunk one channel handles per loop iteration (in elements of T).
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Shrink the chunk (in multiples of minChunkSize) when the remaining data
// does not fill all channels.
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
// nelem can be <= 0 for blocks beyond the end of the data; the primitives
// then move no data but still advance their steps.
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
// Each rank's slice starts at rankDest * size in the input buffer.
offset = chunkOffset + rankDest * size;
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
// ReduceScatter, tree algorithm, LL128 protocol: intentionally empty, no
// implementation is provided for this combination.
// NOTE(review): presumably kept so that every algorithm/protocol combination
// has a kernel symbol — confirm against the kernel tables.
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
+169
Fájl megtekintése
@@ -0,0 +1,169 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
int ncclDebugLevel = -1;
thread_local int ncclDebugNoWarn = 0;
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
FILE *ncclDebugFile = stdout;
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
/* One-time initialization of the debug subsystem from the environment:
 *   NCCL_DEBUG        : log level (VERSION, WARN, INFO, ABORT, TRACE)
 *   NCCL_DEBUG_SUBSYS : comma separated subsystem list, optionally prefixed
 *                       with '^' to exclude the listed subsystems
 *   NCCL_DEBUG_FILE   : output file name; %h expands to the host name, %p to
 *                       the pid and %% to a literal '%'
 * Safe to call from several threads: the work is done under ncclDebugLock and
 * only the first caller performs it. The level is published last, so that
 * ncclDebugLog() does not start using a half-initialized configuration.
 */
void ncclDebugInit() {
  pthread_mutex_lock(&ncclDebugLock);
  // Somebody else completed the initialization while we waited for the lock.
  // The lock must be released on this path too.
  if (ncclDebugLevel != -1) {
    pthread_mutex_unlock(&ncclDebugLock);
    return;
  }

  // An unset or unrecognized NCCL_DEBUG disables logging. Leaving the level
  // at -1 for unknown values would re-run this function on every log call.
  int level = NCCL_LOG_NONE;
  const char* nccl_debug = getenv("NCCL_DEBUG");
  if (nccl_debug == NULL) {
    level = NCCL_LOG_NONE;
  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
    level = NCCL_LOG_VERSION;
  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
    level = NCCL_LOG_WARN;
  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
    level = NCCL_LOG_INFO;
  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
    level = NCCL_LOG_ABORT;
  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
    level = NCCL_LOG_TRACE;
  }

  /* Parse the NCCL_DEBUG_SUBSYS env var
   * This can be a comma separated list such as INIT,COLL
   * or ^INIT,COLL etc
   */
  const char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
  if (ncclDebugSubsysEnv != NULL) {
    int invert = 0;
    if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
    // strtok_r modifies its input, so work on a private copy. If the copy
    // cannot be allocated, the default mask is kept.
    char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
    if (ncclDebugSubsys != NULL) {
      uint64_t debugMask = invert ? ~0ULL : 0ULL;
      char *savePtr = NULL;
      for (char *subsys = strtok_r(ncclDebugSubsys, ",", &savePtr);
           subsys != NULL;
           subsys = strtok_r(NULL, ",", &savePtr)) {
        uint64_t mask = 0;
        if (strcasecmp(subsys, "INIT") == 0) {
          mask = NCCL_INIT;
        } else if (strcasecmp(subsys, "COLL") == 0) {
          mask = NCCL_COLL;
        } else if (strcasecmp(subsys, "P2P") == 0) {
          mask = NCCL_P2P;
        } else if (strcasecmp(subsys, "SHM") == 0) {
          mask = NCCL_SHM;
        } else if (strcasecmp(subsys, "NET") == 0) {
          mask = NCCL_NET;
        } else if (strcasecmp(subsys, "GRAPH") == 0) {
          mask = NCCL_GRAPH;
        } else if (strcasecmp(subsys, "TUNING") == 0) {
          mask = NCCL_TUNING;
        } else if (strcasecmp(subsys, "ALL") == 0) {
          mask = NCCL_ALL;
        }
        // Unknown names leave mask at 0 and are ignored.
        if (mask) {
          if (invert) debugMask &= ~mask; else debugMask |= mask;
        }
      }
      free(ncclDebugSubsys);
      ncclDebugMask = debugMask;
    }
  }

  /* Parse and expand the NCCL_DEBUG_FILE path and
   * then create the debug file. But don't bother unless the
   * NCCL_DEBUG level is > VERSION
   */
  char debugFn[PATH_MAX+1] = "";
  bool debugFileOpened = false;
  const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
  if (level > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
    int c = 0;       // read index into ncclDebugFileEnv
    size_t out = 0;  // number of characters written to debugFn
    // Both the input index and the output length are bounded: %h and %p can
    // expand to many characters, so bounding the input alone is not enough.
    while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX && out < PATH_MAX) {
      const char ch = ncclDebugFileEnv[c++];
      if (ch != '%') {
        debugFn[out++] = ch;
        continue;
      }
      const char spec = ncclDebugFileEnv[c];
      if (spec == '\0') {
        // Trailing '%': echo it and stop without stepping over the terminator.
        debugFn[out++] = '%';
        break;
      }
      c++;
      // Space left in debugFn, including the terminating NUL (always >= 2 here).
      const size_t room = sizeof(debugFn) - out;
      int written = 0;
      switch (spec) {
        case '%': // Double %
          debugFn[out++] = '%';
          break;
        case 'h': { // %h = hostname
          char hostname[1024];
          getHostName(hostname, 1024, '.');
          written = snprintf(debugFn+out, room, "%s", hostname);
          break;
        }
        case 'p': // %p = pid
          written = snprintf(debugFn+out, room, "%d", getpid());
          break;
        default: // Echo everything we don't understand
          debugFn[out++] = '%';
          if (out < PATH_MAX) debugFn[out++] = spec;
          break;
      }
      // snprintf returns the length it wanted to write; on truncation only
      // room-1 characters were actually stored.
      if (written > 0) out += ((size_t)written < room) ? (size_t)written : room-1;
    }
    debugFn[out] = '\0';
    if (debugFn[0] != '\0') {
      FILE *file = fopen(debugFn, "w");
      if (file != NULL) {
        ncclDebugFile = file;
        debugFileOpened = true;
      }
    }
  }

#ifdef ENABLE_TRACE
  ncclEpoch = std::chrono::high_resolution_clock::now();
#endif
  // Publish the level last: a level other than -1 tells other threads that
  // the mask and the output file are ready.
  ncclDebugLevel = level;
  pthread_mutex_unlock(&ncclDebugLock);

  // The INFO macro logs through ncclDebugLog(), which takes ncclDebugLock;
  // emitting the message while still holding the lock would self-deadlock,
  // so it is emitted after the unlock (and therefore lands in the new file).
  if (debugFileOpened) {
    INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
  }
}
/* Common logging function used by the INFO, WARN and TRACE macros
 * Also exported to the dynamically loadable Net transport modules so
 * they can share the debugging mechanisms and output files
 *
 *   level    : severity of the message
 *   flags    : subsystem bits, matched against ncclDebugMask for INFO/TRACE
 *   filefunc : source location string printed for WARN/TRACE/ABORT
 *   line     : source line printed with filefunc
 *   fmt, ... : printf-style message
 *
 * Messages longer than the internal 1024-byte buffer are truncated.
 * When ncclDebugNoWarn is 1 on the calling thread, WARN messages are
 * downgraded to INFO.
 */
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel == -1) ncclDebugInit();
  if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;

  char hostname[1024];
  getHostName(hostname, 1024, '.');
  // Keep a defined value in case the CUDA call fails and leaves it untouched.
  int cudaDev = -1;
  cudaGetDevice(&cudaDev);

  char buffer[1024];
  int len = 0;
  pthread_mutex_lock(&ncclDebugLock);
  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
    len = snprintf(buffer, sizeof(buffer),
                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
    len = snprintf(buffer, sizeof(buffer),
                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
#ifdef ENABLE_TRACE
  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
    len = snprintf(buffer, sizeof(buffer),
                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
  }
#endif
  if (len > 0) {
    // snprintf returns the length it wanted to write, which can exceed the
    // buffer (the host name alone may be up to 1023 characters). Clamp it so
    // that the remaining-size computation below cannot underflow.
    size_t prefix = (size_t)len < sizeof(buffer) ? (size_t)len : sizeof(buffer)-1;
    va_list vargs;
    va_start(vargs, fmt);
    (void) vsnprintf(buffer+prefix, sizeof(buffer)-prefix, fmt, vargs);
    va_end(vargs);
    fprintf(ncclDebugFile,"%s\n", buffer);
    fflush(ncclDebugFile);
  }
  pthread_mutex_unlock(&ncclDebugLock);

  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
            hostname, getpid(), gettid(), cudaDev, filefunc, line);
    abort();
  }
}
+96 -74
Fájl megtekintése
@@ -5,19 +5,17 @@
************************************************************************/
#include "enqueue.h"
#include "checks.h"
#include "param.h"
#include "collectives/collectives.h"
#include "argcheck.h"
// Only generate inline kernels for LL
#define NCCL_FUNC5(coll, op, dtype) \
(void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
(void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
(void*)NCCL_KERN_NAME(coll##LL, op, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
(void*)NCCL_FUNC5(coll##Ring, op, dtype), \
(void*)NCCL_FUNC5(coll##Tree, op, dtype)
(void*)NCCL_FUNC5(coll##Tree, op, dtype), \
(void*)NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -54,7 +52,7 @@
NCCL_FUNCS3B(coll, copy)
// Must be consistent with the ncclFuncSet enum
static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
@@ -207,6 +205,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
channel->collCount = 0;
}
params->gridDim.x = params->blockDim.x = 0;
comm->lastOpCount = comm->opCount;
NCCLCHECK(transportStartProxy(comm));
return ncclSuccess;
}
@@ -228,20 +227,70 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
else if (info->coll == ncclCollAllReduce) {
if (info->nBytes <= info->comm->treeThreshold)
info->pattern = ncclPatternTreeUpDown;
else
info->pattern = ncclPatternRingTwice;
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
{ 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
};
// Selects the algorithm and protocol with the smallest estimated time for this operation,
// then derives the number of channels and threads to use.
// Writes info->algorithm, info->protocol, info->nChannels and info->nThreads.
// Returns ncclInternalError when no algorithm/protocol pair has a non-zero bandwidth.
static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
// Find algorithm / protocol.
info->algorithm = -1;
info->protocol = -1;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float bw = comm->bandwidths[info->coll][a][p];
// A zero bandwidth marks a combination that must not be selected.
if (bw == 0) continue;
// Column of treeCorrectionFactor for this size; sizes past the table get no correction.
int logSize = log2i(info->nBytes>>6);
if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize];
// Estimated time = fixed latency + size / bandwidth (the factor 1000 converts units;
// NOTE(review): exact units of latencies/bandwidths are not visible here -- confirm).
float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw);
if (time < minTime) {
info->algorithm = a;
info->protocol = p;
minTime = time;
}
}
}
// NOTE(review): the next two lines look like leftovers of the removed getPatternInfo
// in this diff view rather than part of this function -- confirm against the real file.
else {
WARN("Unknown collective %d", info->coll);
if (info->algorithm == -1 || info->protocol == -1) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
}
//if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
// Start from the maximum channels/threads and shrink while the operation is too small
// to give every thread threadThreshold bytes: drop channels first (down to 1), then
// halve the thread count for as long as it is a multiple of 128.
int nc = comm->nChannels;
int nt = comm->maxThreads[info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
while (info->nBytes < nc*nt*threadThreshold) {
if (nc >= 2) nc--;
else if ((nt % 128) == 0) nt/=2;
else break;
}
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
info->nChannels = nc;
info->nThreads = nt;
return ncclSuccess;
}
// Maps the collective type and the already selected algorithm to a communication
// pattern stored in info->pattern.
// Returns ncclInternalError (after a warning) for a collective that is not handled.
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
  const bool useTree = (info->algorithm == NCCL_ALGO_TREE);

  if (info->coll == ncclCollBroadcast) {
    info->pattern = useTree ? ncclPatternTreeDown : ncclPatternPipelineFrom;
  } else if (info->coll == ncclCollReduce) {
    info->pattern = useTree ? ncclPatternTreeUp : ncclPatternPipelineTo;
  } else if (info->coll == ncclCollReduceScatter || info->coll == ncclCollAllGather) {
    // These two collectives always use the ring pattern, whatever the algorithm.
    info->pattern = ncclPatternRing;
  } else if (info->coll == ncclCollAllReduce) {
    info->pattern = useTree ? ncclPatternTreeUpDown : ncclPatternRingTwice;
  } else {
    WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
    return ncclInternalError;
  }
  return ncclSuccess;
}
@@ -264,40 +313,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
return ncclSuccess;
}
// Decides between the low-latency (LL) mode and the regular mode for an operation and
// reports the matching channel and thread counts.
//   nChannels, nThreads, llMode : outputs; *llMode is 1 for LL, 0 otherwise.
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
  // Thresholds and limits that users can override
  ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
  int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);

  // LL thread count: double from the minimum while each thread would still get more
  // than the per-thread threshold and the doubled value stays within the limit.
  int llThreads = NCCL_LL_MIN_NTHREADS;
  for (;;) {
    if (DIVUP(info->nBytes, llThreads*info->nchunksPerLoop) <= perThreadLLThreshold) break;
    if (llThreads*2 > maxLLNthreads) break;
    llThreads *= 2;
  }

  // LL channel count, kept within [1, comm->nChannels]
  int llChannels = DIVUP(info->nBytes, llThreads*info->nchunksPerLoop*perThreadLLThreshold);
  if (llChannels == 0) llChannels = 1;
  if (llChannels > info->comm->nChannels) llChannels = info->comm->nChannels;

  // Use the fixed LL threshold when one is set (>= 0), otherwise compute it.
  // Tree patterns get a 4x larger per-thread threshold.
  int perThreadThreshold = info->comm->threadThreshold;
  if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
  ssize_t llThreshold = info->comm->llThreshold >= 0 ?
    info->comm->llThreshold :
    llChannels*llThreads*info->nchunksPerLoop*perThreadThreshold;

  if (info->nBytes > llThreshold) {
    // Regular mode: all channels, all threads plus one.
    *llMode = 0;
    *nChannels = info->comm->nChannels;
    *nThreads = info->comm->nThreads+1;
    return;
  }
  *llMode = 1;
  *nChannels = llChannels;
  *nThreads = llThreads;
}
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getAlgoInfo(info));
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
@@ -307,48 +325,52 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
coll->args.ThisOutput = info->recvbuff;
coll->args.comm = info->comm->devComm;
coll->args.opCount = info->comm->opCount;
coll->args.nChannels = info->nChannels;
coll->args.nThreads = info->nThreads;
// Compute llMode, nChannels, nThreads
int llMode;
getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
int chunkSize = stepSize*chunkSteps;
// Compute lastChunkSize
if (treeMode == 1 && llMode == 0) {
if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
if (info->pattern == ncclPatternTreeUpDown) {
// Optimize chunkSize / nSteps
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (llMode == 1) {
} else if (info->protocol == NCCL_PROTO_LL) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t));
coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
int nstepsInter = 1+log2i(info->comm->nNodes);
while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
}
// Compute nSteps for proxies
size_t nBytes = llMode ? info->nBytes*2 : info->nBytes;
int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
int chunkEffectiveSize = chunkSize;
if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->sliceSteps = sliceSteps;
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->llMode = llMode;
proxyArgs->protocol = info->protocol;
proxyArgs->opCount = info->comm->opCount;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
return ncclSuccess;
}
@@ -401,7 +423,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
channel->collFifoTail = opIndex;
channel->collCount++;
}
/*if (llMode == 0)*/ info->comm->opCount++;
info->comm->opCount++;
return ncclSuccess;
}
@@ -0,0 +1,268 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
#include "graph.h"
#include "trees.h"
#include "rings.h"
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
// Initializes the intra-node part of the rings and trees of every channel from the
// per-channel rank orderings in ringGraph->intra / treeGraph->intra, and records in
// topoRanks the ranks that this node exposes to other nodes.
//   comm      : communicator; comm->channels[0..nChannels) are written, then copied into
//               comm->channels[nChannels..2*nChannels).
//   treeGraph : tree search result; intra holds localRanks entries per channel.
//   ringGraph : ring search result; same layout.
//   topoRanks : output, per-channel ringRecv/ringSend/ringPrev/ringNext and
//               treeUp/treeDn Recv/Send ranks.
// Always returns ncclSuccess.
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nChannels = comm->nChannels;
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
// -1 means "not connected" for every ring/tree neighbor.
channel->ring.prev = channel->ring.next = -1;
channel->treeUp.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
channel->treeDn.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
// Rank ordering of this channel inside the node.
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
for (int i=0; i<localRanks; i++) {
if (ringIntra[i] == rank) {
// First rank of the ordering receives from outside, last one sends outside.
topoRanks->ringRecv[c] = ringIntra[0];
topoRanks->ringSend[c] = ringIntra[localRanks-1];
// Ends of the ordering are left open (-1); connectRings fills them in later.
channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1];
channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
}
if (treeIntra[i] == rank) {
// With the plain tree pattern the same rank receives and sends; other patterns
// send from the second rank of the ordering.
int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
// Neighbors in the ordering, with wrap-around.
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
// Tree loop always flows in the same direction. Other trees are symmetric, i.e.
// up/down go in reverse directions
int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
// Down tree is common
topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
topoRanks->treeDnSend[c] = treeIntra[sendIndex];
channel->treeDn.up = treeIntra[prev];
channel->treeDn.down[0] = treeIntra[next];
// Up tree depends on the pattern
topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
}
// Duplicate channels rings/trees
// NOTE(review): this writes channels [nChannels, 2*nChannels); it assumes comm->channels
// has room for twice nChannels entries -- confirm against the channel array size.
struct ncclChannel* channel0 = comm->channels;
struct ncclChannel* channel1 = channel0+nChannels;
memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
return ncclSuccess;
}
// Closes the rings across nodes: for every channel, links the receiving rank of each
// node to the sending rank of the previous node and the sending rank to the receiving
// rank of the next node (node order wraps around).
// ringRecv/ringSend/ringPrev/ringNext are laid out as [channel][rank]; ringPrev and
// ringNext are updated in place, and the local channel plus its duplicate
// (channel + nChannels) are updated when this rank is involved.
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
  const int nChannels = comm->nChannels;
  const int nNodes = comm->nNodes;

  for (int c = 0; c < nChannels; c++) {
    const int rowOffset = c*comm->nRanks;
    int* recv = ringRecv+rowOffset;
    int* send = ringSend+rowOffset;
    int* prev = ringPrev+rowOffset;
    int* next = ringNext+rowOffset;
    struct ncclChannel* chan = comm->channels+c;
    struct ncclChannel* chanDup = chan+nChannels;

    for (int n = 0; n < nNodes; n++) {
      const int prevNode = (n-1+nNodes)%nNodes;
      const int nextNode = (n+1)%nNodes;

      // Incoming edge of node n
      const int recvRank = recv[firstRanks[n]];
      const int prevSendRank = send[firstRanks[prevNode]];
      prev[recvRank] = prevSendRank;
      if (comm->rank == recvRank) {
        chan->ring.prev = prevSendRank;
        chanDup->ring.prev = prevSendRank;
      }

      // Outgoing edge of node n
      const int sendRank = send[firstRanks[n]];
      const int nextRecvRank = recv[firstRanks[nextNode]];
      next[sendRank] = nextRecvRank;
      if (comm->rank == sendRank) {
        chan->ring.next = nextRecvRank;
        chanDup->ring.next = nextRecvRank;
      }
    }
    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, chan->ring.prev, comm->rank, chan->ring.next);
    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, chanDup->ring.prev, comm->rank, chanDup->ring.next);
  }
  return ncclSuccess;
}
// Fills indexes[n] with the entry of ranks[] belonging to the first rank of node n,
// for every node n in [0, nNodes). Always returns ncclSuccess.
static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) {
  int n = 0;
  while (n < nNodes) {
    const int first = firstRanks[n];
    indexes[n] = ranks[first];
    n++;
  }
  return ncclSuccess;
}
// Sets the parent of tree0 / tree1 to indexes[u0] / indexes[u1].
// A node index of -1 means "no parent" and leaves the corresponding tree untouched.
static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
  struct ncclTree* trees[2] = { tree0, tree1 };
  const int parents[2] = { u0, u1 };
  for (int k = 0; k < 2; k++) {
    if (parents[k] == -1) continue;
    trees[k]->up = indexes[parents[k]];
  }
  return ncclSuccess;
}
// Appends up to two children (indexes[r0], indexes[r1]) to a tree's down[] array.
// At most one slot may already be taken (down[0]); if the slot after it is also in
// use the function warns and returns ncclInternalError. An index of -1 is skipped.
static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
  // First free slot: 0, or 1 when down[0] already holds a child.
  int slot = (down[0] >= 0) ? 1 : 0;
  if (down[slot] >= 0) {
    WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
    return ncclInternalError;
  }
  if (r0 != -1) {
    down[slot] = indexes[r0];
    slot++;
  }
  if (r1 != -1) {
    down[slot] = indexes[r1];
    slot++;
  }
  return ncclSuccess;
}
// Adds the children (d0_0, d0_1) to tree0 and (d1_0, d1_1) to tree1, translating node
// indexes to ranks through indexes[]. Stops at the first failure, so tree1 is left
// untouched when tree0 cannot take more children.
static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
  struct ncclTree* first = tree0;
  struct ncclTree* second = tree1;
  NCCLCHECK(addRanksDown(first->down, indexes, d0_0, d0_1));
  NCCLCHECK(addRanksDown(second->down, indexes, d1_0, d1_1));
  return ncclSuccess;
}
// Cuts the intra-node loop of a tree at upRank: a first child equal to upRank is
// removed, and when this rank is upRank itself its parent is cleared.
static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
  const bool firstChildIsUp = (tree->down[0] == upRank);
  const bool selfIsUp = (rank == upRank);
  if (firstChildIsUp) {
    tree->down[0] = -1;
  }
  if (selfIsUp) {
    tree->up = -1;
  }
  return ncclSuccess;
}
// Does the actual inter-node tree connection for connectTrees, using caller-owned
// scratch arrays indexesSend/indexesRecv (nNodes ints each).
static ncclResult_t connectTreesImpl(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks, int* indexesSend, int* indexesRecv) {
  const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;

  // Compute tree depth. Not an exact value but a good approximation in most
  // cases
  int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);

  // Parent (u) and children (d) node indexes of this node in the two trees.
  int u0, d0_0, d0_1, u1, d1_0, d1_1;
  NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel0 = comm->channels+c;
    struct ncclChannel* channel1 = channel0+nChannels;

    // Up tree: the sending rank of the node talks to its parent node, the receiving
    // rank collects from the children nodes.
    NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
    NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
    NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
    NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
    int root = indexesSend[node];
    if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
    if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));

    // Down tree: same structure with send/recv roles swapped. A node without a parent
    // (u == -1) opens its loop at the up-tree root instead of its receiving rank.
    NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
    NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
    NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
    NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
    if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
    if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
    TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
    TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
    TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
    TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
    channel0->treeUp.depth = channel1->treeUp.depth = depth;
  }
  return ncclSuccess;
}

// Connects the intra-node trees of all nodes into inter-node up/down trees for every
// channel and its duplicate (channel + nChannels), and stores the estimated depth in
// treeUp.depth.
//   treeUpRecv/treeUpSend/treeDnRecv/treeDnSend : [channel][rank] arrays gathered from
//                                                 all ranks.
//   firstRanks : first rank of each node.
// Returns the first error encountered. The two scratch arrays are released on every
// path, including failures (the previous version leaked them on early returns).
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
  int* indexesSend = NULL;
  int* indexesRecv = NULL;
  ncclResult_t res = ncclCalloc(&indexesSend, comm->nNodes);
  if (res == ncclSuccess) res = ncclCalloc(&indexesRecv, comm->nNodes);
  if (res == ncclSuccess) res = connectTreesImpl(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks, indexesSend, indexesRecv);
  // free(NULL) is a no-op, so this is safe whichever step failed.
  free(indexesSend);
  free(indexesRecv);
  return res;
}
// User-tunable bounds on the number of channels. Each NCCL_PARAM line declares a getter
// (ncclParamMinNrings(), ...) used by ncclMinNchannels()/ncclMaxNchannels() below; the
// default -2 is what those functions treat as "not set by the user".
// Legacy naming
NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
// New naming (takes precedence over the legacy names when both are set)
NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
// Returns the user-requested minimum number of channels, in [0, MAXCHANNELS].
// The new-style parameter overrides the legacy one; -2 means "not set". A request
// above MAXCHANNELS is reported with a warning and capped.
int ncclMinNchannels() {
  int requested = 0;
  if (ncclParamMinNrings() != -2) requested = ncclParamMinNrings();
  if (ncclParamMinNchannels() != -2) requested = ncclParamMinNchannels();

  if (requested < 0) return 0;
  if (requested <= MAXCHANNELS) return requested;

  WARN("User asked for a minimum of %d channels, limiting to %d\n", requested, MAXCHANNELS);
  return MAXCHANNELS;
}
// Returns the user-requested maximum number of channels, in [1, MAXCHANNELS].
// The new-style parameter overrides the legacy one; -2 means "not set". Values above
// MAXCHANNELS are capped silently; values below 1 are reported and raised to 1.
int ncclMaxNchannels() {
  int limit = MAXCHANNELS;
  if (ncclParamMaxNrings() != -2) limit = ncclParamMaxNrings();
  if (ncclParamMaxNchannels() != -2) limit = ncclParamMaxNchannels();

  if (limit >= 1) {
    return (limit > MAXCHANNELS) ? MAXCHANNELS : limit;
  }
  WARN("User asked for a maximum of %d channels, setting it to 1\n", limit);
  return 1;
}
// Builds the final rings and trees once the topology ranks of all ranks are known.
//   comm         : communicator; its channels and comm->nChannels are updated.
//   firstRanks   : first rank of each node.
//   allTopoRanks : per-rank data produced by ncclTopoPreset, indexed by rank.
//   rings        : output of ncclBuildRings, nranks entries per channel.
// Steps: flatten allTopoRanks into [channel][rank] arrays, connect rings and trees
// across nodes, duplicate the channels, apply the user min/max channel limits, then
// build and check the rings.
// NOTE(review): every NCCLCHECK early return skips the free() calls at the end, so the
// eight temporary arrays leak on error -- confirm whether that matters to callers.
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
int nranks = comm->nRanks;
int nChannels = comm->nChannels;
// Arrays are sized for MAXCHANNELS rows so they can hold the duplicated channels too.
NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
// Row c holds channel c, column i holds rank i.
for (int i=0; i<nranks; i++) {
for (int c=0; c<nChannels;c++) {
ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
// Duplicate ringPrev/ringNext for ncclBuildRing
// NOTE(review): these copies write rows [nChannels, 2*nChannels); that only fits the
// MAXCHANNELS-row buffers if nChannels*2 <= MAXCHANNELS -- confirm the caller guarantees it.
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
// Duplication should be complete now
nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
int c;
// Pad up to the requested minimum by repeating the existing channels in order.
for (c=nChannels; c<ncclMinNchannels(); c++) {
memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
}
nChannels = comm->nChannels = c;
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
free(ringRecv);
free(ringSend);
free(ringPrev);
free(ringNext);
free(treeUpRecv);
free(treeUpSend);
free(treeDnRecv);
free(treeDnSend);
return ncclSuccess;
}
+363
Fájl megtekintése
@@ -0,0 +1,363 @@
/*************************************************************************
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
// Fixed-capacity list of node pointers, used by ncclTopoSetPaths as the current and
// next frontier of its breadth-first search.
struct ncclTopoNodeList {
struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES]; // entries [0, count) are valid
int count; // number of valid entries in list
};
// Looks up, among the nodes of type t in the system, the one whose id matches and
// returns through *path the entry of node->paths[t] at the same index.
// Returns ncclInternalError (after a warning) when no node of type t has that id.
static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
  const int count = system->nodes[t].count;
  int index = 0;
  while (index < count && system->nodes[t].nodes[index].id != id) index++;

  if (index >= count) {
    WARN("Could not find node of type %d id %lx\n", t, id);
    return ncclInternalError;
  }
  *path = node->paths[t]+index;
  return ncclSuccess;
}
// Computes, for every node reachable from baseNode, the path from that node to
// baseNode and stores it in node->paths[baseNode->type][index of baseNode].
// The search is breadth-first from baseNode; a path is (re)written whenever a wider
// one is found. GPUs are never used as intermediate hops.
// Returns ncclInternalError if a link has no matching reverse link, or the error of a
// failed allocation / lookup.
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
  if (baseNode->paths[baseNode->type] == NULL) {
    NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
  }

  // breadth-first search to set all paths to that node in the system
  struct ncclTopoNodeList nodeList;
  struct ncclTopoNodeList nextNodeList;
  nodeList.count = 1; nodeList.list[0] = baseNode;
  nextNodeList.count = 0;

  // The path from baseNode to itself is empty and local.
  struct ncclTopoLinkList* basePath;
  NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
  basePath->count = 0;
  basePath->width = LOC_WIDTH;
  basePath->type = LINK_LOC;

  while (nodeList.count) {
    nextNodeList.count = 0;
    for (int n=0; n<nodeList.count; n++) {
      struct ncclTopoNode* node = nodeList.list[n];
      struct ncclTopoLinkList* path;
      NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
      for (int l=0; l<node->nlinks; l++) {
        struct ncclTopoLink* link = node->links+l;
        struct ncclTopoNode* remNode = link->remNode;
        if (remNode->paths[baseNode->type] == NULL) {
          NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
        }
        struct ncclTopoLinkList* remPath;
        NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
        // Width of the path going through this link is limited by its narrowest hop.
        int width = std::min(path->width, link->width);
        if (remPath->width < width) {
          // Find reverse link. Track the result in a local: remPath->list[0] may still
          // hold the first hop of a previously recorded (narrower) path, so it cannot
          // be used to tell whether the search succeeded.
          struct ncclTopoLink* revLink = NULL;
          for (int r=0; r<remNode->nlinks; r++) {
            if (remNode->links[r].remNode == node) {
              revLink = remNode->links+r;
              break;
            }
          }
          if (revLink == NULL) {
            // Node ids are 64-bit (see getPath), hence %lx rather than %d.
            WARN("Failed to find reverse path from remNode id %lx type %d nlinks %d to node id %lx type %d",
                remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
            return ncclInternalError;
          }
          remPath->list[0] = revLink;
          // Copy the rest of the path
          for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
          remPath->count = path->count + 1;
          remPath->width = width;

          // Consider the path is QPI when going through the CPU
          // Also don't consider LINK_NET as we only care about the NIC->GPU path.
          int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
          remPath->type = std::max(path->type, type);

          // Add to the list for the next iteration if not already in the list
          // Disallow GPUs as intermediate steps for now
          if (remNode->type != GPU) {
            int i;
            for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
            if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
          }
        }
      }
    }
    memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
  }
  return ncclSuccess;
}
// Logs the paths stored on a node.
// With ENABLE_TRACE: one INFO line per path, listing every hop and the path width.
// Otherwise: a single INFO line with "type/id (count/width/type)" for each destination.
// All formatting is bounded by the size of the local buffer; output that does not fit
// is truncated instead of overflowing the buffer.
static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
  char line[1024];
#ifdef ENABLE_TRACE
  INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
#else
  snprintf(line, sizeof(line), "%s/%lX :", topoNodeTypeStr[node->type], node->id);
  size_t offset = strlen(line);
#endif
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
    if (node->paths[t] == NULL) continue;
    for (int n = 0; n<system->nodes[t].count; n++) {
#ifdef ENABLE_TRACE
      line[0] = 0;
      size_t offset = 0;
      for (int i=0; i<node->paths[t][n].count; i++) {
        struct ncclTopoLink* link = node->paths[t][n].list[i];
        struct ncclTopoNode* remNode = link->remNode;
        // offset never exceeds sizeof(line)-1, so the remaining size is at least 1
        // and snprintf always leaves the buffer NUL-terminated.
        snprintf(line+offset, sizeof(line)-offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
        offset += strlen(line+offset);
      }
      INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
#else
      snprintf(line+offset, sizeof(line)-offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
      offset += strlen(line+offset);
#endif
    }
  }
#ifndef ENABLE_TRACE
  INFO(NCCL_GRAPH, "%s", line);
#endif
}
// Logs the paths of every GPU node, then of every NET node. Always returns ncclSuccess.
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
  const int printedTypes[2] = { GPU, NET };
  for (int k = 0; k < 2; k++) {
    const int type = printedTypes[k];
    int i = 0;
    while (i < system->nodes[type].count) {
      printNodePaths(system, system->nodes[type].nodes+i);
      i++;
    }
  }
  return ncclSuccess;
}
// Finds the CPU with the fewest hops from the given GPU (index into the GPU node list)
// and returns its index through *retCpu. A hop count of 0 recorded so far is treated as
// "nothing chosen yet", so the next CPU always replaces it.
// Returns ncclInternalError (after a warning) when the system has no CPU node.
static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
  struct ncclTopoLinkList* cpuPaths = system->nodes[GPU].nodes[gpu].paths[CPU];
  int bestCpu = -1;
  int bestHops = 0;

  for (int c = 0; c < system->nodes[CPU].count; c++) {
    const int hops = cpuPaths[c].count;
    if (bestHops != 0 && hops >= bestHops) continue;
    bestCpu = c;
    bestHops = hops;
  }

  if (bestCpu != -1) {
    *retCpu = bestCpu;
    return ncclSuccess;
  }
  WARN("Error : could not find CPU close to GPU %d", gpu);
  return ncclInternalError;
}
// Rewrites the path from node (t1, i1) to node (t2, i2) so that it goes through CPU c:
// the new path is the source's path to the CPU followed by the CPU's path to the
// destination. The path type becomes LINK_QPI and the width the smaller of both halves.
// Always returns ncclSuccess.
static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
  struct ncclTopoNode* cpu = system->nodes[CPU].nodes+c;
  struct ncclTopoNode* src = system->nodes[t1].nodes+i1;
  struct ncclTopoLinkList* toCpu = &src->paths[CPU][c];    // source -> CPU
  struct ncclTopoLinkList* fromCpu = &cpu->paths[t2][i2];  // CPU -> destination
  struct ncclTopoLinkList* rewritten = &src->paths[t2][i2];

  int hops = 0;
  for (int i = 0; i < toCpu->count; i++) {
    rewritten->list[hops] = toCpu->list[i];
    hops++;
  }
  for (int i = 0; i < fromCpu->count; i++) {
    rewritten->list[hops] = fromCpu->list[i];
    hops++;
  }

  // Update path characteristics
  rewritten->count = hops;
  rewritten->type = LINK_QPI;
  rewritten->width = std::min(toCpu->width, fromCpu->width);
  return ncclSuccess;
}
// Frees, on every node of the system, the path array leading to nodes of type
// nodeType and resets the pointer to NULL.
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
  for (int t = 0; t < NCCL_TOPO_NODE_TYPES; t++) {
    int n = 0;
    while (n < system->nodes[t].count) {
      struct ncclTopoNode* current = &system->nodes[t].nodes[n];
      free(current->paths[nodeType]);
      current->paths[nodeType] = NULL;
      n++;
    }
  }
}
// (Re)computes all paths to CPUs, GPUs and NICs in the system.
//   system    : topology; existing paths are freed first.
//   peerInfos : per-rank peer information, indexed by GPU rank; may be NULL, in which
//               case only the raw topology paths are computed and no P2P/SHM/GDR
//               adjustment is applied.
// Returns the first error reported by a helper or by a transport canConnect() call.
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
// Set direct paths from/to CPUs. We need them in many cases.
for (int c=0; c<system->nodes[CPU].count; c++) {
NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
}
// Set direct paths from/to GPUs.
for (int g=0; g<system->nodes[GPU].count; g++) {
// Compute paths to GPU g
NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
if (peerInfos == NULL) continue;
// Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
int p2p;
NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if (p2p == 0) {
// No P2P: fall back to shared memory if possible.
int shm;
NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
if (shm == 1) {
// We cannot use GPU Direct, so we need all traffic to go through a CPU
int cpu;
NCCLCHECK(getLocalCpu(system, g, &cpu));
NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
} else {
// We cannot communicate with that peer.
// An empty path (count 0) is what ncclTopoTrimSystem reads as "not connected".
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
}
}
}
}
// Set direct paths from/to NICs.
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
NCCLCHECK(ncclTopoSetPaths(netNode, system));
if (peerInfos == NULL) continue;
for (int g=0; g<system->nodes[GPU].count; g++) {
// gdrSupport is tested as a bit mask with one bit per NIC index.
if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
// We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
// to go through a CPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
}
}
}
return ncclSuccess;
}
// Removes from the system every GPU that the local rank cannot reach, then updates
// comm->localRanks; when all ranks of the communicator are local, the network nodes
// are dropped as well.
// GPUs are grouped in "domains": a GPU joins the lowest domain among the earlier GPUs
// it has a non-empty path to, otherwise it starts its own. Only the domain containing
// the GPU whose rank equals comm->rank is kept.
// Returns ncclInternalError if a GPU scheduled for removal can no longer be found.
// NOTE(review): if the second ncclCalloc fails, `domains` is not freed -- confirm
// whether that leak is acceptable on this error path.
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
int *domains;
int64_t *ids;
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
int myDomain = 0;
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
domains[g] = g;
// Remember ids now: indexes shift once GPUs start being removed below.
ids[g] = gpu->id;
for (int p=0; p<g; p++) {
if (gpu->paths[GPU][p].count > 0) {
domains[g] = std::min(domains[g], domains[p]);
}
}
if (gpu->rank == comm->rank) myDomain = domains[g];
}
// ngpus is the original count; system->nodes[GPU].count shrinks inside the loop.
int ngpus = system->nodes[GPU].count;
for (int i=0; i<ngpus; i++) {
if (domains[i] == myDomain) continue;
// Locate the GPU to remove by id in the (possibly already compacted) array.
struct ncclTopoNode* gpu = NULL;
int g;
for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
gpu = system->nodes[GPU].nodes+g;
if (gpu->id == ids[i]) break; else gpu=NULL;
}
if (gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return ncclInternalError;
}
// Remove GPUs I can't access (even indirectly) from my view of the node
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
if (node == gpu) continue;
for (int l=0; l<node->nlinks; l++) {
// Drop every link pointing at the removed GPU, shifting later links down.
while (l<node->nlinks && node->links[l].remNode == gpu) {
if (l<node->nlinks-1)
memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
node->nlinks--;
}
// GPUs stored after the removed one move down by one slot in the array below,
// so pointers to them must move with them.
if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
node->links[l].remNode--;
}
}
}
}
// Compact the GPU array over the removed entry.
if (g != system->nodes[GPU].count-1)
memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
system->nodes[GPU].count--;
}
comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks) {
// Trim network
ncclTopoRemovePathType(system, NET);
system->nodes[NET].count = 0;
}
free(domains);
free(ids);
return ncclSuccess;
}
// Lowers *speed to the link speed of the given GPU if that is smaller.
// The GPU speed is the larger of the summed width of its NVLink links and the width of
// its PCI link (the last one listed, if there are several).
// Always returns ncclSuccess.
// The former local counter `nvlPeers` was removed: it was updated in the loop but
// never read, so it had no effect on the result.
static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
  int nvlSpeed = 0;
  int pciSpeed = 0;
  for (int l=0; l<node->nlinks; l++) {
    if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
    if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
  }
  *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
  return ncclSuccess;
}
// Computes system->maxSpeed, an upper bound used to try to accelerate the search.
// It starts at LOC_WIDTH, is lowered to the slowest GPU link speed and, when the
// system has NICs, to NET_WIDTH times the number of NICs sharing the best NIC->GPU
// path width.
ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
  system->maxSpeed = LOC_WIDTH;
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    NCCLCHECK(getGpuSpeed(&system->nodes[GPU].nodes[g], &system->maxSpeed));
  }

  if (system->nodes[NET].count == 0) return ncclSuccess;

  // Try to assign one NIC per GPU
  int bestWidth = 0;       // best NIC->GPU width seen so far
  int bestWidthCount = 0;  // number of NICs reaching that width
  for (int n = 0; n < system->nodes[NET].count; n++) {
    struct ncclTopoNode* nic = &system->nodes[NET].nodes[n];
    int nicWidth = 0;
    for (int g = 0; g < system->nodes[GPU].count; g++) {
      nicWidth = std::max(nicWidth, nic->paths[GPU][g].width);
    }
    if (nicWidth > bestWidth) {
      bestWidth = nicWidth;
      bestWidthCount = 1;
    } else if (nicWidth == bestWidth) {
      bestWidthCount++;
    }
  }
  system->maxSpeed = std::min(system->maxSpeed, bestWidthCount*NET_WIDTH);
  return ncclSuccess;
}
// Releases a topology system: first the path arrays of every node type, then the
// system structure itself.
void ncclTopoFree(struct ncclTopoSystem* system) {
  int type = 0;
  while (type < NCCL_TOPO_NODE_TYPES) {
    ncclTopoRemovePathType(system, type);
    type++;
  }
  free(system);
}
@@ -0,0 +1,57 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
// Sizing of the text line built by dumpLine.
#define MAXWIDTH 20 // maximum number of values printed on one line
#define PREFIXLEN 15 // number of characters budgeted for the line prefix
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH) // line capacity, excluding the terminating NUL
/* Log one line made of `prefix` followed by up to MAXWIDTH values.
 *
 * values : array of at least min(nranks, MAXWIDTH) integers.
 * nranks : number of entries in values ; entries past MAXWIDTH are not printed.
 * prefix : text placed at the beginning of the line.
 *
 * Each value is appended as " %3d". All writes are bounded by the size of the
 * line buffer, so a long prefix or wide values truncate the line instead of
 * writing past the end of it.
 */
void dumpLine(int* values, int nranks, const char* prefix) {
  char line[STRLENGTH+1];
  int offset = snprintf(line, sizeof(line), "%s", prefix);
  if (offset < 0) { line[0] = '\0'; offset = 0; }
  // snprintf returns the untruncated length ; clamp to what was really stored
  if (offset > STRLENGTH) offset = STRLENGTH;
  for (int i=0; i<nranks && i<MAXWIDTH && offset<STRLENGTH; i++) {
    int written = snprintf(line+offset, sizeof(line)-offset, " %3d", values[i]);
    if (written < 0) break;
    offset += written;
    if (offset > STRLENGTH) offset = STRLENGTH;
  }
  INFO(NCCL_INIT,"%s", line);
}
/* Build the rank order of each ring as seen from `rank`, and validate it.
 *
 * nrings : number of rings (channels).
 * rings  : output, nrings*nranks entries ; ring r starts at rings[r*nranks]
 *          with `rank` itself and then follows `next`.
 * rank   : local rank ; rank 0 also logs each ring through dumpLine().
 * nranks : number of ranks per ring.
 * prev   : not read by this function.
 * next   : nrings*nranks entries ; next[r*nranks+i] is the successor of rank i
 *          in ring r.
 *
 * Returns ncclInternalError if a ring does not loop back to `rank` after
 * nranks steps or does not contain every rank, ncclSuccess otherwise.
 */
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
  for (int r=0; r<nrings; r++) {
    int* ring = rings + r*nranks;
    int* successor = next + r*nranks;
    char prefix[30];

    // Walk the ring starting from ourselves
    int cursor = rank;
    for (int pos=0; pos<nranks; pos++) {
      ring[pos] = cursor;
      cursor = successor[cursor];
    }

    sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
    if (rank == 0) dumpLine(ring, nranks, prefix);

    if (cursor != rank) {
      WARN("Error : ring %d does not loop back to start (%d != %d)", r, cursor, rank);
      return ncclInternalError;
    }

    // Check that all ranks are there
    for (int wanted=0; wanted<nranks; wanted++) {
      int pos = 0;
      while (pos < nranks && ring[pos] != wanted) pos++;
      if (pos == nranks) {
        WARN("Error : ring %d does not contain rank %d", r, wanted);
        return ncclInternalError;
      }
    }
  }
  return ncclSuccess;
}
@@ -0,0 +1,7 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
+594
Fájl megtekintése
@@ -0,0 +1,594 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "graph.h"
#include "topo.h"
/* Reserve (width > 0) or release (width <= 0) bandwidth along a path.
 *
 * graph    : graph being built ; its type and nHops are updated.
 * path     : list of links to follow. An empty path is a no-op which leaves
 *            *node untouched.
 * node     : output ; set to the node at the end of the path on success, and
 *            to NULL when the path cannot be followed.
 * width    : bandwidth to subtract from every link of the path. Callers undo
 *            a reservation by calling again with the negated width.
 * typeSave : graph->type to restore when releasing.
 *
 * A reservation is refused (*node == NULL) when the path type is higher than
 * graph->type, or when one link does not have `width` left ; in that case the
 * links already debited are credited back. graph->nHops is only increased
 * once the whole path has been reserved, so that a refused reservation, which
 * callers never undo, leaves the graph unchanged.
 *
 * Always returns ncclSuccess.
 */
static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
  if (path->count == 0) return ncclSuccess;

  *node = NULL;
  if (width > 0) {
    if (path->type > graph->type) return ncclSuccess;
  } else {
    // Releasing : restore the graph state saved before the reservation
    graph->type = typeSave;
    graph->nHops -= path->count;
  }

  for (int i=0; i<path->count; i++) {
    if (path->list[i]->width < width) {
      // Can't follow this path, rewind and exit
      for (int j=0; j<i; j++) path->list[j]->width += width;
      return ncclSuccess;
    }
    path->list[i]->width -= width;
  }

  if (width > 0) {
    // Account for the path only now that it is fully reserved
    graph->type = std::max(graph->type, path->type);
    graph->nHops += path->count;
  }
  *node = path->list[path->count-1]->remNode;
  return ncclSuccess;
}
/* Return the remaining PCI width of a GPU : the minimum of the width of its
 * first PCI link that has a return link, and the width of that return link
 * (the link of the remote node which points back to the GPU).
 * Returns -1 when no such pair of links exists. */
static int gpuPciWidth(struct ncclTopoNode* gpu) {
  for (int up=0; up<gpu->nlinks; up++) {
    struct ncclTopoLink* toPci = gpu->links+up;
    if (toPci->type == LINK_PCI) {
      struct ncclTopoNode* remote = toPci->remNode;
      // Look for the link coming back from the remote node to this GPU
      for (int down=0; down<remote->nlinks; down++) {
        struct ncclTopoLink* fromPci = remote->links+down;
        if (fromPci->remNode == gpu) return std::min(toPci->width, fromPci->width);
      }
    }
  }
  return -1;
}
/* Choose the order in which we try next GPUs. This is critical for the search
to quickly converge to the best solution even if it eventually times out.
One ncclGpuScore is filled per candidate GPU by ncclTopoSearchNextGpuSort and
the array is ordered with cmpScore. Fields are listed from the least to the
most significant sort key. */
struct ncclGpuScore {
int g; // Retain the index
int startIndex; // Least important ; distance (in GPU index, modulo ngpus) from the current GPU, lower first
int intraNhops; // Number of hops of the GPU->GPU path, lower first
int intraWidth; // Width of the GPU->GPU path, higher first
int interNhops; // Number of hops of the NET->GPU path, lower first (0 when NET is not considered)
int interPciWidth; // gpuPciWidth() of the candidate, higher first (0 when NET is not considered)
int interWidth; // Most important ; width of the NET->GPU path, higher first (0 when NET is not considered)
};
/* qsort() comparator for struct ncclGpuScore.
 * Keys, from most to least significant : interWidth (descending),
 * interPciWidth (descending), interNhops (ascending), intraWidth (descending),
 * intraNhops (ascending), startIndex (ascending). */
static int cmpScore(const void * g1, const void * g2) {
  struct ncclGpuScore *a = (struct ncclGpuScore*)g1;
  struct ncclGpuScore *b = (struct ncclGpuScore*)g2;
  // Each key is only evaluated when all the previous ones are equal
  int diff = b->interWidth - a->interWidth;
  if (diff == 0) diff = b->interPciWidth - a->interPciWidth;
  if (diff == 0) diff = a->interNhops - b->interNhops;
  if (diff == 0) diff = b->intraWidth - a->intraWidth;
  if (diff == 0) diff = a->intraNhops - b->intraNhops;
  if (diff == 0) diff = a->startIndex - b->startIndex;
  return diff;
}
/* Tell whether the candidates differ in their intra-node score.
 * Returns 1 if any entry has an intraWidth or intraNhops different from the
 * first entry, 0 if they are all identical (or count <= 1). */
static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
  const int refWidth = scores[0].intraWidth;
  const int refNhops = scores[0].intraNhops;
  int i = 1;
  while (i < count) {
    const int same = (scores[i].intraWidth == refWidth) && (scores[i].intraNhops == refNhops);
    if (!same) return 1;
    i++;
  }
  return 0;
}
/* Find the NET->GPU paths of the first NET node whose `used` mask contains
 * `flag`, and store them in *netPaths.
 * Returns ncclInternalError (leaving *netPaths untouched) if no NET node is
 * marked with that flag. */
static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
  const int nNets = system->nodes[NET].count;
  int n = 0;
  while (n < nNets && !(system->nodes[NET].nodes[n].used & flag)) n++;
  if (n == nNets) return ncclInternalError;
  *netPaths=system->nodes[NET].nodes[n].paths[GPU];
  return ncclSuccess;
}
/* List the GPUs which can follow `gpu` in the current channel, best first.
 *
 * system   : topology.
 * graph    : graph being built ; graph->nChannels selects the `used` bit.
 * gpu      : current GPU.
 * next     : output array of GPU indexes, in the order they should be tried.
 * countPtr : output ; number of entries written to next.
 * sortNet  : 0 to ignore the network ; otherwise the NET->GPU paths of the NIC
 *            used by this channel take part in the score. With -1, the order
 *            is reversed when all candidates have the same intra-node score.
 *
 * GPUs with no path from `gpu`, or already used by this channel, are skipped.
 */
ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
  const uint64_t channelBit = 1ULL<<(graph->nChannels);
  const int ngpus = system->nodes[GPU].count;
  struct ncclTopoLinkList* gpuPaths = gpu->paths[GPU];
  struct ncclTopoLinkList* nicPaths = NULL;
  if (sortNet) NCCLCHECK(getNetPaths(system, channelBit, &nicPaths));

  struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
  memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));

  // Visit the other GPUs in increasing index distance from the current one
  const int self = gpu-system->nodes[GPU].nodes;
  int nCandidates = 0;
  for (int dist=1; dist<ngpus; dist++) {
    const int g = (self+dist)%ngpus;
    if (gpuPaths[g].count == 0) continue; // There is no path to that GPU
    if (system->nodes[GPU].nodes[g].used & channelBit) continue;

    struct ncclGpuScore* score = scores+nCandidates;
    score->g = g;
    score->startIndex = dist;
    score->intraNhops = gpuPaths[g].count;
    score->intraWidth = gpuPaths[g].width;
    if (nicPaths) {
      score->interNhops = nicPaths[g].count;
      score->interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
      score->interWidth = nicPaths[g].width;
    }
    nCandidates++;
  }

  // Sort GPUs
  qsort(scores, nCandidates, sizeof(struct ncclGpuScore), cmpScore);

  // Go reverse for sortNet = -1 when all have the same intra-node score
  const bool reversed = (sortNet == -1) && (cmpIntraScores(scores, nCandidates) == 0);
  for (int i=0; i<nCandidates; i++) {
    next[i] = scores[reversed ? nCandidates-1-i : i].g;
  }
  *countPtr = nCandidates;
  return ncclSuccess;
}
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should keep every search within a second or so.
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
/* Find which GPU the previous channel used at position step+1.
 *
 * step : position in the previous channel ; -1 selects its first GPU.
 * g    : output ; index of the GPU whose rank matches, or -1 on error.
 *
 * Returns ncclInternalError when there is no previous channel
 * (graph->nChannels == 0) or when no GPU has the recorded rank.
 */
ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) {
  *g = -1;
  if (graph->nChannels == 0) return ncclInternalError;

  const int ngpus = system->nodes[GPU].count;
  const int wantedRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];

  int idx = 0;
  while (idx < ngpus && system->nodes[GPU].nodes[idx].rank != wantedRank) idx++;
  if (idx == ngpus) return ncclInternalError;

  *g = idx;
  return ncclSuccess;
}
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
/* Try to continue the current channel with GPU index g.
 *
 * When `paths` is not NULL, bandwidth `speed` is first reserved along
 * paths[g] ; if that fails the GPU is silently skipped. The GPU is marked as
 * used by the current channel for the duration of the recursive search, then
 * unmarked, and the reservation is released.
 */
ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
  const int savedType = graph->type;
  const uint64_t channelBit = 1ULL<<(graph->nChannels);
  struct ncclTopoNode* target = system->nodes[GPU].nodes+g;

  // ncclTopoFollowPath resets target to NULL when the path cannot be taken
  if (paths != NULL) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &target, speed, savedType));
  if (target == NULL) return ncclSuccess;

  target->used ^= channelBit;
  NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, target, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
  target->used ^= channelBit;

  // Give the bandwidth back
  if (paths != NULL) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &target, -speed, savedType));
  return ncclSuccess;
}
/* Decide whether `graph` should replace `refGraph` as the best solution.
 *
 * copy : output ; set to 1 when graph is better. It is never reset to 0, the
 *        caller is expected to initialize it.
 *
 * Rules, in order :
 *  0. When speedIntra > speedInter, never accept a solution with fewer
 *     channels than the reference.
 *  1. Higher nChannels*speedIntra wins ; lower loses.
 *  2. On a tie, a multi-channel graph with identical channels beats a
 *     reference whose channels differ.
 *  3. Otherwise, fewer hops wins.
 */
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
  const int newBw = graph->nChannels*graph->speedIntra;
  const int refBw = refGraph->nChannels*refGraph->speedIntra;

  // 0. Increasing speedIntra must not cost channels, it would likely impact rings too
  const bool raisingIntra = graph->speedIntra > graph->speedInter;
  if (raisingIntra && graph->nChannels < refGraph->nChannels) return ncclSuccess;

  // 1. Try to get better bandwidth
  if (newBw != refBw) {
    if (newBw > refBw) *copy = 1;
    return ncclSuccess;
  }

  // 2. Give an advantage when all channels are the same
  const bool allSame = graph->nChannels > 1 && graph->sameChannels;
  if (allSame && refGraph->sameChannels == 0) {
    *copy = 1;
    return ncclSuccess;
  }

  // 3. Less hops
  if (graph->nHops < refGraph->nHops) *copy = 1;
  return ncclSuccess;
}
/* Recursive step of the channel search, positioned on `gpu` at index `step`
 * of the channel being built (channel number graph->nChannels).
 *
 * saveGraph       : best solution found so far ; overwritten when a better one is found.
 * backToNet       : step at which the channel goes back to a NIC, or -1.
 * backToFirstRank : step at which the channel loops back to its first GPU, or -1.
 * forcedOrder     : FORCED_ORDER_PCI, FORCED_ORDER_REPLAY, or 0 for a normal search.
 * maxSpeed        : total bandwidth at which the search is stopped.
 * time            : remaining search budget ; decremented on each call, the
 *                   call is a no-op once it is <= 0.
 */
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
if ((*time) <= 0) return ncclSuccess;
(*time)--;
int ngpus = system->nodes[GPU].count;
if (step == ngpus) {
// The channel is complete.
// Determine whether we found a better solution or not
int copy = 0;
int sameChannels = graph->sameChannels;
if (graph->nChannels > 0) {
// Compare this channel with the previous one, entry by entry
int* intra = graph->intra+graph->nChannels*ngpus;
for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0;
}
// Count the new channel while comparing and searching for more channels
graph->nChannels++;
NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
if (copy) {
memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
// Target bandwidth reached : a negative time makes every further call return immediately
if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1;
}
if (graph->nChannels < MAXCHANNELS/2) {
NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time));
}
// Restore the state for the caller
graph->nChannels--;
graph->sameChannels = sameChannels;
return ncclSuccess;
}
// Record the rank of this GPU at position `step` of the current channel
graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
if (step == backToNet) {
// first get back to NIC
if (system->nodes[NET].count) {
int maxWidth = 0;
struct ncclTopoLinkList* paths = gpu->paths[NET];
// First pass : best GPU->NET width among the allowed NICs. Unless crossNic is 1,
// only the NIC recorded in inter[2*channel] is allowed.
for (int n=0; n<system->nodes[NET].count; n++) {
if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
maxWidth = std::max(paths[n].width, maxWidth);
}
// Second pass : try every allowed NIC reaching that width
for (int n=0; n<system->nodes[NET].count; n++) {
if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
if (paths[n].width == maxWidth) {
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
int typeSave = graph->type;
NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
if (net) {
graph->inter[graph->nChannels*2+1] = net->id;
// Same GPU, same step, but with backToNet disabled
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time));
// Release the bandwidth reserved towards the NIC
NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave));
}
}
}
}
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
struct ncclTopoLinkList* paths = gpu->paths[GPU];
int next[NCCL_TOPO_MAX_NODES];
int count;
if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
next[0] = step+1;
count = 1;
} else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
count = 1;
} else { // Normal search
// sortNet is 0 when we do not go back to a NIC, 1 when the next GPU is the one going back to the NIC, -1 otherwise
NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
}
for (int i=0; i<count; i++) {
int g = next[i];
// graph->nvlink stays set only while every path taken has a type <= LINK_NVL ; restored after the attempt
int nvlink = graph->nvlink;
graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
int speed = graph->speedIntra;
// Paths of type LINK_QPI use the speed adjusted by INTEL_P2P_OVERHEAD
if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
graph->nvlink = nvlink;
}
} else if (step == backToFirstRank) {
// Find first GPU and loop back to it
int g;
int rank = graph->intra[graph->nChannels*ngpus];
for (g=0; g<ngpus; g++) {
if (system->nodes[GPU].nodes[g].rank == rank) break;
}
if (g == ngpus) {
WARN("Could not find GPU with rank %d\n", rank);
return ncclInternalError;
}
struct ncclTopoLinkList* paths = gpu->paths[GPU];
struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
int typeSave = graph->type;
NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
if (firstGpu) {
// Continue at step+1 with backToFirstRank disabled, then release the bandwidth
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave));
}
} else {
// Next path
// Nothing left to do for this channel : jump to the step == ngpus case
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time));
}
return ncclSuccess;
}
/* Search for one more channel starting from the network.
 *
 * Every NET node whose `used` mask is 0 is tried in turn as the entry NIC of
 * channel graph->nChannels. For each of them, three strategies are attempted :
 * the PCI order starting from GPU 0, a replay of the previous channel, then a
 * normal search from the GPUs closest to the NIC.
 * backToNet, backToFirstRank, maxSpeed and time are forwarded to the GPU search.
 */
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
const uint64_t flag = 1ULL<<(graph->nChannels);
const int speed = graph->speedInter;
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoNode* gpu;
if (net->used == 0) {
// Record the entry NIC of this channel
graph->inter[graph->nChannels*2] = net->id;
// Mark every NET node sharing the same rank as used by this channel
for (int i=0; i<system->nodes[NET].count; i++) {
if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
}
struct ncclTopoLinkList* paths = net->paths[GPU];
// First try the PCI order to set a reference
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed));
// Then try to replay the last channel
if (graph->nChannels > 0) {
int g;
// step -1 returns the first GPU of the previous channel
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed));
}
// Then try the most local GPUs
// Find the best NET->GPU width and, among paths of that width, the lowest hop count
int maxWidth = 0, minHops = 0xfffffff;
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].width > maxWidth) {
maxWidth = paths[g].width;
minHops = paths[g].count;
} else if (paths[g].width == maxWidth && paths[g].count < minHops) {
minHops = paths[g].count;
}
}
if (maxWidth >= speed) {
// In the first loop, avoid using GPUs in both directions between channels (one channel
// sending from that GPU and one channel receiving to that GPU), since that usually leads
// to lower BW.
for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].width == maxWidth && paths[g].count == minHops) {
gpu = system->nodes[GPU].nodes+g;
// A GPU with no positive PCI width left is treated as already used ;
// such GPUs are only tried in the second pass (tryGpuBidir == 1)
int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
if (tryGpuBidir == gpuUsed) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed));
}
}
}
}
}
// Undo the marking done above
for (int i=0; i<system->nodes[NET].count; i++) {
if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
}
}
}
return ncclSuccess;
}
/* Search Patterns
*
* Intra-node
* Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a
* (=Split Tree Loop)
* Tree : GPU a -> GPU b -> .. -> GPU x
* (=Split Tree)
*
* Inter-node
* Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
* Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
* `--> NET n (or m if crossNic)
*/
/* Translate a search pattern into the steps at which a channel goes back to
 * the NIC (*backToNet) and loops back to its first GPU (*backToFirstRank).
 * A value of -1 means "never". Steps are GPU positions in the channel, the
 * last one being ngpus-1.
 *
 * With NET nodes    : backToNet is ngpus-1 for RING, 0 for TREE, 1 otherwise ;
 *                     backToFirstRank is ngpus-1 for SPLIT_TREE_LOOP only.
 * Without NET nodes : backToNet is -1 ; backToFirstRank is ngpus-1 for RING
 *                     and SPLIT_TREE_LOOP.
 */
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
  const int lastStep = system->nodes[GPU].count-1;
  const bool isRing = (pattern == NCCL_TOPO_PATTERN_RING);
  const bool isSplitTreeLoop = (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP);

  if (system->nodes[NET].count) {
    *backToNet = isRing ? lastStep : (pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1);
    *backToFirstRank = isSplitTreeLoop ? lastStep : -1;
  } else {
    *backToNet = -1;
    *backToFirstRank = (isRing || isSplitTreeLoop) ? lastStep : -1;
  }
  return ncclSuccess;
}
/* Search for one more channel for `graph`, saving any better solution in
 * `saveGraph`.
 *
 * maxSpeed : total bandwidth at which the search is stopped.
 * time     : remaining search budget, shared by the whole recursion.
 *
 * With NET nodes the search starts from the network. Otherwise it starts from
 * GPU 0 and tries, in order, the PCI order, a replay of the previous channel
 * (if any) and a normal search.
 * Errors of the underlying searches are propagated to the caller.
 */
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
  int backToNet, backToFirstRank;
  NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
  if (system->nodes[NET].count) {
    // Start from NET
    NCCLCHECK(ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time));
  } else {
    // Start from GPU 0
    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
    if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
  }
  return ncclSuccess;
}
/* Parse user defined rings. Format is like :
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
* Rings with a non-matching number of ranks are ignored so we can provide
* rings for multiple cases.
*/
#define MAX_ENV_RANKS 512
/* Parse a list of user-defined channels.
 *
 * str          : channels separated by '|', each a list of decimal numbers
 *                separated by any non-digit character.
 * nChannelsRet : output ; number of channels accepted.
 * ngpus        : expected number of entries per channel.
 * channels     : output ; accepted channel c is stored at channels[c*ngpus].
 *
 * A channel is ignored when it does not have exactly ngpus entries, or when an
 * entry is out of [0, ngpus) or appears twice. Parsing stops as soon as
 * MAX_ENV_RANKS numbers have been read for a single channel.
 * Always returns ncclSuccess.
 */
static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
  int parsed[MAX_ENV_RANKS];  // Numbers read so far for the current channel
  int nParsed = 0;
  int nAccepted = 0;
  bool inNumber = false;

  for (int pos=0; ; pos++) {
    const char c = str[pos];
    if (c >= '0' && c <= '9') {
      const int digit = c - '0';
      parsed[nParsed] = inNumber ? parsed[nParsed]*10+digit : digit;
      inNumber = true;
    } else {
      if (inNumber) {
        // A number just ended
        nParsed++;
        if (nParsed == MAX_ENV_RANKS) break;
      }
      inNumber = false;
      if (c == '|' || c == '\0') {
        // End of a channel. Ignore it if ngpus doesn't match.
        bool valid = (nParsed == ngpus);
        for (int r=0; valid && r<ngpus; r++) {
          const int value = parsed[r];
          // Ignore if ranks are out of bounds
          if (value < 0 || value >= ngpus) valid = false;
          // Ignore if ranks are duplicate
          for (int i=0; valid && i<r; i++) {
            if (parsed[i] == value) valid = false;
          }
          if (valid) channels[nAccepted*ngpus+r] = value;
        }
        if (valid) nAccepted++;
        nParsed = 0;
      }
    }
    if (c == '\0') break;
  }

  *nChannelsRet = nAccepted;
  return ncclSuccess;
}
/* Compute the channels of `graph` for the pattern set in graph->pattern.
 *
 * On input, graph->pattern and graph->crossNic are read. On output, the graph
 * holds the channels (intra/inter), their number and their speeds.
 *
 * If the NCCL_GRAPH environment variable provides at least one valid channel,
 * those channels are used as is (reversed for the ring pattern, keeping the
 * first GPU in place). Otherwise a search is run repeatedly with progressively
 * relaxed constraints (simpler pattern, higher path type, crossNic, lower
 * speed), then with a higher intra-node speed for non-ring patterns. If no
 * solution is found at all, a single channel following the GPU order is used.
 */
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
  graph->speedIntra = graph->speedInter = 0;
  if (graph->crossNic == 2) graph->crossNic = 0;
  graph->nvlink = 0;
  graph->type = LINK_LOC;
  graph->nChannels = 0;
  graph->sameChannels = 1;

  char* str = getenv("NCCL_GRAPH");
  if (str) {
    NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
    for (int i=0; i<graph->nChannels*ngpus; i++) {
      // Translate gpu numbers into ranks
      graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
    }
    // TODO : let user specify NICs
    graph->inter[0] = graph->inter[1] = 0;
    graph->speedIntra = graph->speedInter = PCI_WIDTH+2;
    graph->nvlink = 0;
    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
      // Reverse the loop : keep entry 0 and swap entry i with entry ngpus-i.
      // Start at 1 ; starting at 0 would write to index ngpus, which is the
      // first entry of the next channel.
      for (int c=0; c<graph->nChannels; c++) {
        for (int i=1; i<=ngpus/2; i++) {
          int tmp = graph->intra[ngpus*c+i];
          graph->intra[ngpus*c+i] = graph->intra[ngpus*c+ngpus-i];
          graph->intra[ngpus*c+ngpus-i] = tmp;
        }
      }
    }
    if (graph->nChannels) return ncclSuccess;
  }

  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

  // tmpGraph is the working copy ; graph receives the best solution found
  struct ncclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
  int bestSpeed = 0;

  // First try crossnic, then decrease speed and finally increase speedIntra.
  tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
  int maxSpeed = system->maxSpeed;
  tmpGraph.pattern = graph->pattern;

search:
  int time = NCCL_SEARCH_TIMEOUT;
  tmpGraph.nvlink = 1;
  tmpGraph.nChannels = 0;
  tmpGraph.sameChannels = 1;

  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));

#if 0
  printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
      printf("%d ", graph->intra[c*ngpus+g]);
    }
    printf("\n");
  }
#endif
  // time == -1 : the search reached maxSpeed and stopped by itself
  if (time == -1) goto done;
  // We already have a solution and we timed out so lower speed will just timeout as well
  if (time == 0 && graph->nChannels > 0) goto done;
  if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;

  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
    // First pass, we don't have a solution yet ; try to go slower.

    // Try a simpler tree
    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
      tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
      goto search;
    }
    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
      goto search;
    }
    tmpGraph.pattern = graph->pattern;

    // Allow paths of a higher type, up to LINK_QPI
    if (tmpGraph.type < LINK_QPI) {
      tmpGraph.type += 1;
      goto search;
    }
    tmpGraph.type = graph->type;

    if (crossNic && tmpGraph.crossNic == 0) {
      // Try again with crossNic if permitted
      tmpGraph.crossNic = crossNic;
      goto search;
    }
    tmpGraph.crossNic = graph->crossNic;

    // Try to reduce speed per channel
    tmpGraph.speedIntra = tmpGraph.speedInter -= 3;
    if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search;
  }

done:
  // We have a solution now. See if we can increase speedIntra
  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
  }
  if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
    // Try to increase the intra speed only but keeping nChannels the same
    tmpGraph.speedIntra += 3;
    maxSpeed = tmpGraph.speedIntra * graph->nChannels;
    if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
  }

  if (graph->nChannels == 0) {
    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
    graph->inter[0] = graph->inter[1] = 0;
    graph->speedIntra = graph->speedInter = 3;
    graph->nvlink = 0;
    graph->nChannels = 1;
  }
  return ncclSuccess;
}
/* Log a graph : one summary line, then one line per channel listing the entry
 * NET device (if the system has NET nodes), the GPU ranks in order, and the
 * exit NET device.
 *
 * Every append is bounded by the size of the line buffer, so a channel that
 * does not fit is truncated in the log instead of overflowing the buffer.
 */
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
  int ngpus = system->nodes[GPU].count;
  char line[1024];
  for (int c=0; c<graph->nChannels; c++) {
    snprintf(line, sizeof(line), "%2d :", c);
    // strlen() is always < sizeof(line) since snprintf terminates the string
    int offset = strlen(line);
    if (system->nodes[NET].count > 0) {
      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]);
      offset = strlen(line);
    }
    for (int i=0; i<ngpus; i++) {
      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]);
      offset = strlen(line);
    }
    if (system->nodes[NET].count > 0) {
      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]);
      offset = strlen(line);
    }
    INFO(NCCL_GRAPH, "%s", line);
  }
  return ncclSuccess;
}
/* Return in *dev the NET device of a channel.
 * dir selects the entry (0) or exit (1) slot of graph->inter ; channelId is
 * wrapped modulo graph->nChannels, which must not be 0. */
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
  const int channel = channelId%graph->nChannels;
  const int slot = channel*2+dir;
  *dev = graph->inter[slot];
  return ncclSuccess;
}
+641
Fájl megtekintése
@@ -0,0 +1,641 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "nvmlwrap.h"
#include "net.h"
#include <sys/stat.h>
#include <fcntl.h>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
// Printable names of path distances. NOTE(review): presumably ordered from
// closest to farthest ; the indexing enum is not visible here - confirm.
const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
// Printable names of node types, indexed by node type (e.g. GPU, NET).
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
// Printable names of link types. NOTE(review): presumably indexed by the
// LINK_* constants (LINK_LOC, LINK_NVL, LINK_PCI, LINK_QPI, ...) - confirm.
const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/
/* Read the NUMA node of a device from "<path>/numa_node".
 * Returns the integer found in the file, or -1 when the file cannot be opened,
 * is empty, or does not start with an integer. */
static int getNumaId(char *path) {
  char numaFile[PATH_MAX];
  snprintf(numaFile, PATH_MAX, "%s/numa_node", path);
  numaFile[PATH_MAX-1] = '\0';

  FILE *fp = fopen(numaFile, "r");
  if (fp == NULL) return -1;

  // node keeps its default when fscanf matches nothing
  int node = -1;
  const int rc = fscanf(fp, "%d", &node);
  fclose(fp);
  return (rc == EOF) ? -1 : node;
}
/* Resolve the sysfs path of a PCI device.
 *
 * busId : bus id in "0000:00:00.0" form ; converted to lower case in place.
 * path  : output ; canonical path returned by realpath(), which the caller
 *         must free().
 *
 * Returns ncclSystemError if the path cannot be resolved.
 */
static ncclResult_t getPciPath(char* busId, char** path) {
  for (char* c = busId; c < busId+BUSID_SIZE; c++) *c = tolower(*c);

  // Template : the bus part and the device part are both overwritten below
  char sysPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  char* busPart = sysPath+sizeof("/sys/class/pci_bus/")-1;
  char* devPart = sysPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1;
  memcpy(busPart, busId, BUSID_REDUCED_SIZE-1);
  memcpy(devPart, busId, BUSID_SIZE-1);

  *path = realpath(sysPath, NULL);
  if (*path != NULL) return ncclSuccess;

  WARN("Could not find real path of %s", sysPath);
  return ncclSystemError;
}
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
//
// path      : PCI path.
// offset    : position in path at which the component of interest ends ; a '/'
//             at that position is skipped.
// minOffset : not used.
// id        : output ; result of busIdToInt64() on the component.
//
// The component starts after the closest '/' found before offset. The backward
// scan never goes before the beginning of path ; when no '/' is found, the
// component starts at the beginning of path.
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
  char* str = path+offset;
  // Remove trailing "/"
  if (*str == '/' && str > path) str--;
  // Find next /
  while (str > path && *str != '/') str--;
  if (*str == '/') str++;
  NCCLCHECK(busIdToInt64(str, id));
  return ncclSuccess;
}
/* Find the index of the GPU node with the given id.
 * *index is set to -1 when no GPU matches ; if several GPUs share the id, the
 * highest index is returned. Always returns ncclSuccess. */
static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
  *index = -1;
  // Scan backwards so that the first hit is the highest matching index
  for (int i=system->nodes[GPU].count-1; i>=0; i--) {
    if (id == system->nodes[GPU].nodes[i].id) {
      *index = i;
      break;
    }
  }
  return ncclSuccess;
}
/* Resolve the sysfs path of the PCI device identified by `id`.
 * The id is converted to a bus id string with int64ToBusId(), then resolved
 * by getPciPath() ; *path must be freed by the caller. */
static ncclResult_t getPath(int64_t id, char** path) {
  char pciBusId[BUSID_SIZE] = "0000:00:00.0";
  NCCLCHECK(int64ToBusId(id, pciBusId));
  NCCLCHECK(getPciPath(pciBusId, path));
  return ncclSuccess;
}
/* Resolve the sysfs path of a CUDA device.
 * The PCI bus id is obtained from cudaDeviceGetPCIBusId() and resolved by
 * getPciPath() ; *path must be freed by the caller. */
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
  char pciBusId[BUSID_SIZE];
  CUDACHECK(cudaDeviceGetPCIBusId(pciBusId, BUSID_SIZE, cudaDev));
  NCCLCHECK(getPciPath(pciBusId, path));
  return ncclSuccess;
}
// Cached results of getCpuWidths() ; both 0 until the detection has run.
int interCpuWidth = 0;
int cpuPciWidth = 0;

/* Detect the CPU and set cpuPciWidth and interCpuWidth accordingly.
 *
 * Detection only runs once ; later calls return immediately.
 * Both widths default to PCI_WIDTH. interCpuWidth is then set to P9_WIDTH on
 * PPC, and on x86_64 Intel CPUs to SKL_QPI_WIDTH for family 6 with a model
 * >= 0x55 or QPI_WIDTH otherwise.
 *
 * The model is the full model number, built from the base and extended model
 * fields of CPUID leaf 1 (bits 7:4 and 19:16). Bits 27:20 are the extended
 * family.
 */
static ncclResult_t getCpuWidths() {
  // Check if already detected
  if (interCpuWidth + cpuPciWidth) return ncclSuccess;

  // Defaults
  char cpu[256];
  sprintf(cpu, "Generic");
  cpuPciWidth = interCpuWidth = PCI_WIDTH;

#ifdef __PPC__
  sprintf(cpu, "ppc64");
  interCpuWidth = P9_WIDTH;
#endif
#ifdef __x86_64__
  sprintf(cpu, "x86_64");
  union {
    struct {
      // CPUID 0 String register order
      uint32_t ebx;
      uint32_t edx;
      uint32_t ecx;
    };
    char vendor[12];
  } cpuid0;

  // cpuid overwrites eax, ebx, ecx and edx : eax is declared as read/write
  uint32_t leaf = 0;
  asm volatile("cpuid" : "+a" (leaf), "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx));
  if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");

  if (strcmp(cpu, "Intel") == 0) {
    union {
      struct {
        // CPUID 1 EAX layout ; unsigned so that fields are never sign-extended
        unsigned steppingId:4;
        unsigned modelId:4;
        unsigned familyId:4;
        unsigned processorType:2;
        unsigned resv0:2;
        unsigned extModelId:4;
        unsigned extFamilyId:8;
        unsigned resv1:4;
      };
      uint32_t val;
    } cpuid1;
    cpuid1.val = 1;
    asm volatile("cpuid" : "+a" (cpuid1.val) : : "ebx", "ecx", "edx");
    // The extended fields only apply to some base family values
    const int familyId = (cpuid1.familyId == 0xf) ? (int)(cpuid1.familyId + cpuid1.extFamilyId) : (int)cpuid1.familyId;
    const int modelId = (cpuid1.familyId == 0x6 || cpuid1.familyId == 0xf) ? (int)((cpuid1.extModelId << 4) + cpuid1.modelId) : (int)cpuid1.modelId;
    if (familyId == 6 && modelId >= 0x55) { // Skylake
      sprintf(cpu, "Intel/Skylake (or later)");
      interCpuWidth = SKL_QPI_WIDTH;
    } else {
      interCpuWidth = QPI_WIDTH;
    }
  }
#endif
  INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
  return ncclSuccess;
}
// Return in *width the width of the link between CPUs, running the CPU
// detection (getCpuWidths) first if it has not run yet.
static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
NCCLCHECK(getCpuWidths());
*width = interCpuWidth;
return ncclSuccess;
}
// Return in *width the detected cpuPciWidth, running the CPU detection
// (getCpuWidths) first if it has not run yet.
static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
NCCLCHECK(getCpuWidths());
*width = cpuPciWidth;
return ncclSuccess;
}
// Return in *width the width used for PCI links : the PCI_WIDTH constant.
static ncclResult_t ncclTopoGetPciWidth(int* width) {
*width = PCI_WIDTH;
return ncclSuccess;
}
// Return in *width the width used for network links : the NET_WIDTH constant.
static ncclResult_t ncclTopoGetNetWidth(int* width) {
*width = NET_WIDTH;
return ncclSuccess;
}
// Kind of device found at the remote end of an NVLink, as classified by
// ncclDeviceType() from the PCI class of the device.
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceUnknown, // Any other PCI class
ncclNvLinkDeviceGpu, // PCI class 0x030200 (3D controller) or 0x030000 (VGA controller)
ncclNvLinkDeviceSwitch, // PCI class 0x068000 (NVSwitch)
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
/* Classify a PCI device from its class, read from
 * /sys/bus/pci/devices/<busId>/class.
 *
 * busId : bus id in lower case "0000:00:00.0" form.
 * type  : output ; only written on success.
 *
 * Returns ncclSystemError when the class file cannot be resolved or opened.
 */
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
  char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
  char* rPath = realpath(classPath, NULL);
  // realpath() returns NULL when the device does not exist ; never pass NULL to open()
  int fd = (rPath == NULL) ? -1 : open(rPath, O_RDONLY);
  if (fd == -1) {
    // Could not find device. It might be because we're in a VM and
    // we don't see the whole machine. This is handled silently so
    // we don't want to print an INFO error.
    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath ? rPath : classPath, strerror(errno));
    free(rPath);
    return ncclSystemError;
  }
  free(rPath);
  // Pre-filled so that a short read still leaves a well-formed, terminated string
  char pciClass[9];
  strncpy(pciClass, "0x000000", 9);
  int len;
  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
  SYSCHECK(close(fd), "close");
  if (strcmp(pciClass, "0x068000") == 0) {
    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
    *type = ncclNvLinkDeviceSwitch;
  } else if (strcmp(pciClass, "0x068001") == 0) {
    // PCI device is of type "Bridge: IBM Device 04ea"
    *type = ncclNvLinkDeviceBridge;
  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
      || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
    *type = ncclNvLinkDeviceGpu;
  } else {
    *type = ncclNvLinkDeviceUnknown;
  }
  return ncclSuccess;
}
/* Connect `node` to the CPU node of NUMA node `numaId`, in both directions,
 * with the given link type and width. The CPU node is created if it does not
 * exist yet. */
ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
  struct ncclTopoNode* cpu = NULL;
  // Scan backwards : if several CPU nodes match, the highest index is used
  for (int c=system->nodes[CPU].count-1; c>=0; c--) {
    if (system->nodes[CPU].nodes[c].id == numaId) {
      cpu = system->nodes[CPU].nodes+c;
      break;
    }
  }
  if (cpu == NULL) { // Create CPU
    NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
  }
  NCCLCHECK(ncclTopoConnectNodes(node, cpu, linkType, linkWidth));
  NCCLCHECK(ncclTopoConnectNodes(cpu, node, linkType, linkWidth));
  return ncclSuccess;
}
/* Add the NVLink connections of every GPU to the topology.
 *
 * nvmlDevs : NVML device handles, indexed like system->nodes[GPU].nodes.
 * system   : topology to update.
 *
 * Each usable NVLink of each GPU is connected to what is at its other end : a
 * GPU of the system, a CPU (bridge) or an NVSwitch node shared by all GPUs.
 * Links which are not P2P capable, not active, or whose remote end cannot be
 * queried are skipped. Finally, system->maxSpeed and system->maxWidth are
 * derived from the smallest number of NVLinks and the smallest link width
 * found over all GPUs, or from the PCI width when a GPU has no NVLink.
 */
ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
struct ncclTopoNode* nvsNode = NULL;
// Minimums over all GPUs, starting from the largest configuration handled below
int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
int cudaMajor, cudaMinor;
NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
// Number of NVLinks to probe and width per link, from the compute capability
int maxNvLinks, width;
if (cudaMajor < 6) {
maxNvLinks = 0;
width = 0;
} else if (cudaMajor == 6) {
maxNvLinks = 4;
width = PASCAL_NVLINK_WIDTH;
} else {
maxNvLinks = 6;
width = VOLTA_NVLINK_WIDTH;
}
int nvlinks = 0;
for (int l=0; l<maxNvLinks; ++l) {
// Check whether we can use this NVLink for P2P
unsigned canP2P;
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
// Make sure the Nvlink is up. The previous call should have trained the link.
nvmlEnableState_t isActive;
if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
// Try to figure out what's on the other side of the NVLink
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
// Make a lower case copy of the bus ID for calling ncclDeviceType
// PCI system path is in lower case
char* p = remoteProc.busId;
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
lowerId[c] = tolower(p[c]);
// The terminating 0 has just been copied
if (p[c] == 0) break;
}
enum ncclNvLinkDeviceType type;
NCCLCHECK(ncclDeviceType(lowerId, &type));
if (type == ncclNvLinkDeviceGpu) {
int64_t remoteId;
NCCLCHECK(busIdToInt64(lowerId, &remoteId));
int peer;
NCCLCHECK(idToIndex(system, remoteId, &peer));
// peer is -1 when the remote GPU is not part of the system ; the link is then ignored
if (peer != -1) {
NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
nvlinks++;
}
} else if (type == ncclNvLinkDeviceBridge) {
// Nvlink between GPU and CPU (PPC)
// Since the remote bridge does not have a valid numa_node, assume we
// are connected to the closest CPU.
char* path;
NCCLCHECK(getPath(gpu->id, &path));
int numaId = getNumaId(path);
free(path);
NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
nvlinks++;
} else { // Nvswitch
if (type == ncclNvLinkDeviceUnknown) {
// The NVLink is up but we couldn't find the PCI device on the other
// side. Assume it's an NVswitch outside a VM.
if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
}
if (nvsNode == NULL) { // Create nvswitch
NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
}
// Links to the NVSwitch always use VOLTA_NVLINK_WIDTH, in both directions
NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
nvlinks++;
}
}
minNvlinks = std::min(minNvlinks, nvlinks);
minWidth = std::min(minWidth, width);
}
int pciWidth;
NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
// Without NVLink on every GPU, fall back to the PCI width
system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
system->maxWidth = minNvlinks ? minWidth : pciWidth;
return ncclSuccess;
}
// Attach endNode to the topology by walking its PCI path from the leaf back
// towards the root complex, creating (or reusing) one PCI node for every
// second '/' encountered, then hooking the top-most node to a CPU node.
//   system  : topology being built
//   endNode : leaf node (GPU or NIC) to attach
//   path    : PCI path string of the device
ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
  int pciWidth;
  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));

  // Everything up to and including the 4th '/' is skipped by the walk below.
  int rootOffset = 0;
  for (int slashes = 0; rootOffset < strlen(path); rootOffset++) {
    if (path[rootOffset] == '/' && ++slashes == 4) break;
  }

  // Walk backwards from the end of the path down to (excluding) rootOffset.
  struct ncclTopoNode* tail = endNode;
  int seen = 0;
  for (int pos = (int)strlen(path) - 1; pos > rootOffset; pos--) {
    if (path[pos] != '/') continue;
    // Only every second separator yields a PCI node.
    if ((++seen % 2) != 0) continue;

    int64_t pciId;
    NCCLCHECK(pciPathToInt64(path, pos, rootOffset, &pciId));

    // Look for an already known PCI node with this id.
    struct ncclTopoNode* known = NULL;
    for (int p = 0; p < system->nodes[PCI].count; p++) {
      if (system->nodes[PCI].nodes[p].id == pciId) {
        known = system->nodes[PCI].nodes + p;
        break;
      }
    }
    if (known != NULL) {
      // Found our PCI switch. Attach and stop since the rest should already
      // be connected.
      NCCLCHECK(ncclTopoConnectNodes(known, tail, LINK_PCI, pciWidth));
      NCCLCHECK(ncclTopoConnectNodes(tail, known, LINK_PCI, pciWidth));
      return ncclSuccess;
    }

    struct ncclTopoNode* created;
    NCCLCHECK(ncclTopoCreateNode(system, &created, PCI, pciId));
    NCCLCHECK(ncclTopoConnectNodes(created, tail, LINK_PCI, pciWidth));
    NCCLCHECK(ncclTopoConnectNodes(tail, created, LINK_PCI, pciWidth));
    tail = created;
  }

  // No existing switch was met: attach the top of the chain to a CPU node.
  int cpuWidth;
  int numaId = getNumaId(path);
  NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&cpuWidth));
  NCCLCHECK(ncclTopoConnectCpu(system, numaId, tail, LINK_PCI, cpuWidth));
  return ncclSuccess;
}
// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
#include <glob.h>
#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
// Read the system image GUID of the mlx5 device found under `path`.
//   path : PCI path of the NIC; IB_GUID_PATH is appended to it and the
//          wildcard is resolved with glob().
// Returns the 64-bit GUID assembled from the four 16-bit hex groups in the
// file, or 0 when the file cannot be opened or is not fully parsed.
uint64_t getIbGuid(char* path) {
  uint64_t guid = 0ULL;
  char guidPath[PATH_MAX];
  snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
  // PATH has a wildcard in it so use glob()
  glob_t globbuf;
  glob(guidPath, 0, NULL, &globbuf);
  if (globbuf.gl_pathc > 0) {
    // snprintf always NUL-terminates, unlike strncpy.
    snprintf(guidPath, PATH_MAX, "%s", globbuf.gl_pathv[0]);
  }
  globfree(&globbuf);
  guidPath[PATH_MAX-1] = '\0';
  FILE *file = fopen(guidPath, "r");
  if (file != NULL) {
    // %lx expects unsigned long; widen explicitly when assembling the GUID.
    unsigned long a = 0, b = 0, c = 0, d = 0;
    // Require all four groups: a partial match used to leave some of them
    // uninitialized and still produce a "GUID".
    if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) == 4) {
      guid = ((uint64_t)a << 48) + ((uint64_t)b << 32) + ((uint64_t)c << 16) + (uint64_t)d;
      TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
    }
    fclose(file);
  }
  return guid;
}
// Per network device scratch data used while attaching NICs to the topology.
struct netInfo {
// PCI path reported by ncclNetPciPath(), or NULL when that call failed.
char* path;
// Id of the NIC node this device belongs to (device index by default, or the
// value computed from the PCI path when an IB GUID was found).
int64_t nic;
// IB system image GUID when available, otherwise the device index.
uint64_t asic;
// Number of earlier devices that share the same `nic` id.
int port;
// Network identifier, later stored in the NET node's `rank` field.
int net;
};
// Fill in nic/asic/port/net for each of the `ndev` entries of `netInfos`.
// Every entry defaults to its own index; when an IB GUID can be read from the
// device path, devices are merged by PCI path (same NIC, different port) and
// by GUID+port (same network).
// Returns ncclSystemError if the path cannot be duplicated, or the error of
// pciPathToInt64(); ncclSuccess otherwise.
ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
  for (int n=0; n<ndev; n++) {
    struct netInfo* info = netInfos+n;
    uint64_t ibGuid;
    info->nic = n;
    info->asic = n;
    info->port = 0;
    info->net = n;
    if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
      info->asic = ibGuid;

      // Ignore PCI subdevice when computing the ID to merge multi-port cards
      // and make them use the same PCI link.
      char* path = strdup(info->path);
      if (path == NULL) {
        WARN("Error : could not duplicate PCI path %s", info->path);
        return ncclSystemError;
      }
      size_t len = strlen(path);
      if (len > 0) path[len-1] = '0';
      // Release the copy before checking the result so it cannot leak.
      ncclResult_t convStatus = pciPathToInt64(path, len, 0, &info->nic);
      free(path);
      NCCLCHECK(convStatus);

      // Same PCI path -> different ports of the same NIC
      for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
      // Same GUID -> same network links as the other NIC
      for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
    }
    // nic is 64-bit: print it with a matching conversion, and never hand a
    // NULL pointer to %s.
    INFO(NCCL_GRAPH, "%s -> %lx/%lx/%d/%d", info->path ? info->path : "(null)",
        (unsigned long)info->nic, (unsigned long)info->asic, info->port, info->net);
  }
  return ncclSuccess;
}
// Create (or reuse) the NIC node of network device `dev`, attach it to the PCI
// tree (or directly to CPU 0 when it has no PCI path) and create its NET node.
// Does not take ownership of info->path.
static ncclResult_t topoAttachNetDev(struct ncclTopoSystem* system, struct netInfo* info, int dev, int netWidth) {
  // Several devices may map to the same NIC id: reuse the node if it exists.
  struct ncclTopoNode* nicNode = NULL;
  for (int i=0; i<system->nodes[NIC].count; i++) {
    if (system->nodes[NIC].nodes[i].id == info->nic) {
      nicNode = system->nodes[NIC].nodes+i;
      break;
    }
  }
  if (!nicNode) {
    NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
    if (info->path) {
      // Create the PCI path
      NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
    } else {
      // This is probably a virtual NIC. Just attach it directly to CPU 0
      int width;
      NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
      NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
    }
  }
  // Create the network side
  struct ncclTopoNode* netNode;
  NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, dev));
  // Use rank to store the net information
  netNode->rank = info->net;
  NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
  NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
  return ncclSuccess;
}

// Build the PCI side of the topology: attach every GPU through its PCI path,
// attach every network device (NIC + NET nodes), then connect all CPU nodes
// to each other with LINK_QPI links.
// Temporary path strings and the netInfo array are released on every exit
// path, including errors.
ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    char* path;
    NCCLCHECK(getPath(gpu->id, &path));
    ncclResult_t gpuStatus = ncclTopoCreatePciPath(system, gpu, path);
    free(path);
    NCCLCHECK(gpuStatus);
  }

  // Connect the NICs
  int netDevCount;
  NCCLCHECK(ncclNetDevices(&netDevCount));
  int netWidth;
  NCCLCHECK(ncclTopoGetNetWidth(&netWidth));

  struct netInfo* netInfos;
  NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
  for (int n=0; n<netDevCount; n++) {
    // A device without a PCI path is handled as a virtual NIC later on.
    if (ncclNetPciPath(n, &netInfos[n].path) != ncclSuccess) netInfos[n].path = NULL;
  }

  ncclResult_t netStatus = ncclTopoComputeNetInfo(netInfos, netDevCount);
  for (int n=0; netStatus == ncclSuccess && n<netDevCount; n++) {
    netStatus = topoAttachNetDev(system, netInfos+n, n, netWidth);
  }
  // Single cleanup point, reached whether or not the steps above succeeded.
  for (int n=0; n<netDevCount; n++) free(netInfos[n].path);
  free(netInfos);
  NCCLCHECK(netStatus);

  // And connect all CPU nodes together
  for (int n=0; n<system->nodes[CPU].count; n++) {
    for (int p=0; p<system->nodes[CPU].count; p++) {
      if (n == p) continue;
      int width;
      NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
    }
  }
  return ncclSuccess;
}
// Log `node` and, recursively, everything reachable from it through PCI links.
//   node     : node to print
//   prevNode : node we came from (links back to it are not printed)
//   line     : shared text buffer; columns [0, offset) hold the prefix
//   offset   : column at which this node's text starts
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
  char* cursor = line + offset;

  // Label of the node itself; GPUs also show their rank.
  if (node->type != GPU) {
    sprintf(cursor, "%s/%lX", topoNodeTypeStr[node->type], node->id);
  } else {
    sprintf(cursor, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
  }
  INFO(NCCL_GRAPH, "%s", line);

  // Blank the prefix so that children are printed indented under this node.
  for (int i = offset - 1; i >= 0; i--) line[i] = ' ';

  for (int l = 0; l < node->nlinks; l++) {
    struct ncclTopoLink* link = node->links + l;
    // Skip the self link and the link leading back to where we came from.
    if (link->type == LINK_LOC || link->remNode == prevNode) continue;

    struct ncclTopoNode* peer = link->remNode;
    sprintf(cursor, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
    int nextOffset = strlen(line);

    if (link->type == LINK_PCI) {
      // PCI links form the tree: descend.
      NCCLCHECK(ncclTopoPrintRec(peer, node, line, nextOffset));
      continue;
    }

    // Any other link type: print the remote end only, without descending.
    if (peer->type == NET) {
      sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[peer->type], peer->id, peer->rank);
    } else {
      sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[peer->type], peer->id);
    }
    INFO(NCCL_GRAPH, "%s", line);
  }
  return ncclSuccess;
}
// Dump the whole topology to the NCCL_GRAPH log: a header with the system
// limits, one tree per CPU node, then the pre-computed paths.
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
  INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);

  // Scratch buffer shared by the recursive printer.
  char line[1024];
  struct ncclTopoNode* cpus = s->nodes[CPU].nodes;
  for (int c = 0; c < s->nodes[CPU].count; c++) {
    NCCLCHECK(ncclTopoPrintRec(cpus + c, NULL, line, 0));
  }

  INFO(NCCL_GRAPH, "==========================================");
  NCCLCHECK(ncclTopoPrintPaths(s));
  return ncclSuccess;
}
// Reorder the links of `node` so that the link going up to `upNode` comes
// last, then do the same for every node below it in the PCI tree.
//   node   : node whose links are reordered
//   upNode : parent in the PCI tree, or NULL for a root (CPU) node
// Returns ncclInternalError if `upNode` is not among the links of `node`.
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
  // Shift all links to have upLink as last link
  if (upNode) {
    // Both scans are bounded by the array size: the original relied on a
    // NULL terminator that does not exist when all slots are in use.
    int l = 0;
    while (l < NCCL_TOPO_MAX_LINKS && node->links[l].remNode != upNode) l++;
    if (l == NCCL_TOPO_MAX_LINKS) {
      WARN("Error : could not find the up link while sorting node of type %d", node->type);
      return ncclInternalError;
    }
    struct ncclTopoLink upLink = node->links[l];
    while (l+1 < NCCL_TOPO_MAX_LINKS && node->links[l+1].remNode) {
      node->links[l] = node->links[l+1];
      l++;
    }
    node->links[l] = upLink;
  }

  // Recursively sort the PCI tree
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
  }
  return ncclSuccess;
}
// Link ordering that makes graph traversal easier/faster:
//   1. NVLinks (already the case)
//   2. PCI down
//   3. PCI up
//   4. QPI (already the case)
// Each CPU node is the root of one PCI tree, so sort from every CPU.
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
  struct ncclTopoNode* cpus = system->nodes[CPU].nodes;
  for (int c = 0; c < system->nodes[CPU].count; c++) {
    NCCLCHECK(ncclTopoSort(cpus + c, NULL));
  }
  return ncclSuccess;
}
// Build the topology of the local node for communicator `comm`.
// One GPU node is created per rank that shares our host hash and is visible
// through NVML; NVLink and PCI connections are then added and links sorted.
//   comm   : communicator providing peerInfo, rank and nRanks
//   system : receives the newly allocated topology on success
// On failure nothing is returned through `system` and every temporary
// allocation (including the partially built topology) is released.
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclTopoSystem* s;
  NCCLCHECK(ncclCalloc(&s, 1));

  nvmlDevice_t* nvmlDevs;
  ncclResult_t status = ncclCalloc(&nvmlDevs, comm->nRanks);
  if (status != ncclSuccess) free(s);
  NCCLCHECK(status);

  int g = 0;
  for (int r=0; r<comm->nRanks; r++) {
    if (comm->peerInfo[r].hostHash != comm->peerInfo[comm->rank].hostHash) continue;
    // Consider the GPU as outside of our node if we can't see it through NVML.
    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
    status = int64ToBusId(comm->peerInfo[r].busId, busId);
    if (status != ncclSuccess) break;
    if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
    g++;
    struct ncclTopoNode* gpuNode;
    status = ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId);
    if (status != ncclSuccess) break;
    gpuNode->rank = r;
  }

  if (status == ncclSuccess) status = ncclTopoConnectNVLink(nvmlDevs, s);
  if (status == ncclSuccess) status = ncclTopoConnectPCI(s);
  free(nvmlDevs);
  if (status == ncclSuccess) status = ncclTopoSortSystem(s);

  // Drop the partially built system before reporting the error.
  if (status != ncclSuccess) free(s);
  NCCLCHECK(status);

  *system = s;
  return ncclSuccess;
}
// Report through *nvlink (1 or 0) whether the pre-computed path between the
// GPUs identified by busId1 and busId2 is of type LINK_NVL. GPUs that are
// not part of the topology yield 0.
ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
  int from, to;
  NCCLCHECK(idToIndex(system, busId1, &from));
  NCCLCHECK(idToIndex(system, busId2, &to));

  int connected = 0;
  if (from != -1 && to != -1) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes + from;
    connected = (gpu->paths[GPU][to].type == LINK_NVL) ? 1 : 0;
  }
  *nvlink = connected;
  return ncclSuccess;
}
// Report through *nvlink (1 or 0) whether the GPU identified by busId has a
// LINK_NVL path to at least one other GPU of the topology.
// A GPU that is not part of the topology yields 0, matching what
// ncclTopoGetNvlink does for unknown GPUs.
ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
  *nvlink = 0;
  int g;
  NCCLCHECK(idToIndex(system, busId, &g));
  // idToIndex reports "not found" as -1; indexing nodes[] with it would read
  // outside of the array.
  if (g == -1) return ncclSuccess;
  for (int i=0; i<system->nodes[GPU].count; i++) {
    if (i == g) continue;
    if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
      *nvlink = 1;
      return ncclSuccess;
    }
  }
  return ncclSuccess;
}
// Classify a pre-computed path as one of the PATH_* distances.
// Starts from PATH_PIX (PATH_PXB when the path has more than two links) and
// toggles between PATH_PHB and PATH_SYS each time a link lands on a CPU node.
static int pathDistance(struct ncclTopoLinkList* links) {
  int distance = (links->count > 2) ? PATH_PXB : PATH_PIX;
  for (int hop = 0; hop < links->count; hop++) {
    if (links->list[hop]->remNode->type != CPU) continue;
    // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
    if (distance == PATH_PHB) {
      distance = PATH_SYS;
    } else {
      distance = PATH_PHB;
    }
  }
  return distance;
}
// Compute the PATH_* distance between the GPUs identified by busId1 and
// busId2 and store it in *distance.
// Returns ncclInternalError if either GPU is not part of the topology.
ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
  int g1, g2;
  NCCLCHECK(idToIndex(system, busId1, &g1));
  NCCLCHECK(idToIndex(system, busId2, &g2));
  // idToIndex reports "not found" as -1; never index the node array with it.
  if (g1 == -1 || g2 == -1) {
    WARN("Error : GPU %lx or %lx not found in topology", (unsigned long)busId1, (unsigned long)busId2);
    return ncclInternalError;
  }
  *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2);
  return ncclSuccess;
}
// Compute the PATH_* distance between the GPU identified by busId and the
// network device `netDev`, and store it in *distance.
// Returns ncclInternalError if the GPU is not part of the topology or if
// netDev is not a valid NET node index.
ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
  int g;
  NCCLCHECK(idToIndex(system, busId, &g));
  // idToIndex reports "not found" as -1; never index the node array with it.
  if (g == -1) {
    WARN("Error : GPU %lx not found in topology", (unsigned long)busId);
    return ncclInternalError;
  }
  // NOTE(review): assumes paths[NET] holds one entry per NET node - confirm
  // against the path computation code.
  if (netDev < 0 || netDev >= system->nodes[NET].count) {
    WARN("Error : invalid network device %d (have %d)", netDev, system->nodes[NET].count);
    return ncclInternalError;
  }
  *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
  return ncclSuccess;
}
// Store the number of CPU nodes of the topology in *count.
ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
  struct ncclTopoNodeSet* cpuSet = system->nodes + CPU;
  *count = cpuSet->count;
  return ncclSuccess;
}
+138
Fájl megtekintése
@@ -0,0 +1,138 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TOPO_H_
#define NCCL_TOPO_H_
#include "graph.h"
#include "core.h"
#define LOC_WIDTH 5000
#define PASCAL_NVLINK_WIDTH 18
#define VOLTA_NVLINK_WIDTH 21
#define PCI_WIDTH 12 // PCI Gen3 x16
#define QPI_WIDTH 8
#define SKL_QPI_WIDTH 12
#define P9_WIDTH 32
#define NET_WIDTH 12 // 100Gbit
// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
// The argument is parenthesized so that expressions such as a+b scale as a whole.
#define INTEL_P2P(speed) ((speed)*9/12)
#define INTEL_P2P_OVERHEAD(speed) ((speed)*12/9)
#define NCCL_TOPO_NODE_TYPES 6
#define GPU 0
#define PCI 1
#define NVS 2
#define CPU 3 // Actually NUMA domains
#define NIC 4
#define NET 5
extern const char* topoNodeTypeStr[];
#define LINK_LOC 0
#define LINK_NVL 1
#define LINK_PCI 2
#define LINK_QPI 3
#define LINK_NET 4
extern const char* topoLinkTypeStr[];
struct ncclTopoNode;
// One directed connection from a node to a remote node.
struct ncclTopoLink {
// One of the LINK_* constants.
int type;
// Link width; ncclTopoConnectNodes() adds up the widths of repeated
// connections of the same type to the same remote node.
int width;
// Remote end of the link. NULL marks an unused slot in ncclTopoNode::links.
struct ncclTopoNode* remNode;
};
#define NCCL_TOPO_MAX_LINKS 32
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
#define SELECT_PATH 1
#define SELECT_LAST 2
#define NET_GDR_MASK 0x70000000
// A path through the topology, stored as the sequence of links to follow.
struct ncclTopoLinkList {
// Links of the path; the first `count` entries are valid.
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
int count;
// NOTE(review): presumably the width of the narrowest link on the path -
// confirm against the path computation code.
int width;
// Compared against LINK_* constants by callers (e.g. LINK_NVL checks).
int type;
};
// A vertex of the topology graph (GPU, PCI, NVS, CPU, NIC or NET).
struct ncclTopoNode {
// Node type: one of GPU, PCI, NVS, CPU, NIC, NET.
int type;
// Identifier, unique among nodes of the same type.
int64_t id;
// For GPU nodes, the rank of the GPU; NET nodes reuse it to store the
// network identifier.
int rank;
// Number of used entries in `links`.
int nlinks;
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
// Pre-computed paths to GPUs and NICs
// Indexed first by target node type, then by target node index.
struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
// Used during search
uint64_t used;
};
// Fixed-capacity array holding all nodes of one type.
struct ncclTopoNodeSet {
// Number of used entries in `nodes`.
int count;
struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
};
// The whole topology: one node set per node type plus system-wide limits.
struct ncclTopoSystem {
// Indexed by node type (GPU, PCI, NVS, CPU, NIC, NET).
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
// Set when NVLinks are connected: based on the minimum NVLink count and
// width across GPUs, or on the PCI width when a GPU has no NVLink.
int maxSpeed;
int maxWidth;
// NOTE(review): presumably a flag owned by the graph search code - not
// written anywhere in this file section; confirm.
int searchInitDone;
};
// Return through *node the node of the given type and id, creating it when it
// does not exist yet.
//   system : topology to search / extend
//   node   : receives the existing or newly created node
//   type   : node type (GPU, PCI, NVS, CPU, NIC, NET)
//   id     : identifier of the node within its type
// Returns ncclInternalError when the node set of that type is already full.
static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  struct ncclTopoNodeSet* set = system->nodes + type;

  // Reuse an existing node with the same id, if any.
  int i = 0;
  while (i < set->count) {
    if (set->nodes[i].id == id) {
      *node = set->nodes + i;
      return ncclSuccess;
    }
    i++;
  }

  if (set->count == NCCL_TOPO_MAX_NODES) {
    WARN("Error : tried to create too many nodes of type %d\n", type);
    return ncclInternalError;
  }

  // Take the next free slot.
  struct ncclTopoNode* fresh = set->nodes + set->count;
  set->count++;
  fresh->type = type;
  fresh->id = id;
  if (type == GPU) {
    // Create link to itself (used in some corner cases)
    struct ncclTopoLink* self = fresh->links;
    self->type = LINK_LOC;
    self->remNode = fresh;
    self->width = LOC_WIDTH;
    fresh->nlinks = 1;
  }
  *node = fresh;
  return ncclSuccess;
}
// Add a directed link of the given type and width from `node` to `remNode`.
// Connecting the same pair with the same type again adds to the width of the
// existing link instead of creating a new one (used to aggregate NVLinks).
// The links of `node` are kept sorted by descending width.
// Returns ncclInternalError when `node` has no free link slot left.
static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
  // Find the matching link, or the first free slot. The scan is bounded by
  // the array size so that a full node cannot be overrun.
  int slot = 0;
  while (slot < NCCL_TOPO_MAX_LINKS && node->links[slot].remNode != NULL) {
    if (node->links[slot].remNode == remNode && node->links[slot].type == type) break;
    slot++;
  }
  if (slot == NCCL_TOPO_MAX_LINKS) {
    WARN("Error : tried to create too many links on node of type %d", node->type);
    return ncclInternalError;
  }

  // Aggregate links into higher width for NVLink
  struct ncclTopoLink* link = node->links + slot;
  if (link->remNode == NULL) node->nlinks++;
  link->type = type;
  link->remNode = remNode;
  link->width += width;

  // Sort links in BW descending order: move the updated link towards the
  // front while its predecessor is narrower.
  struct ncclTopoLink linkSave = *link;
  while (link != node->links && (link-1)->width < linkSave.width) {
    *link = *(link-1);
    link--;
  }
  *link = linkSave;
  return ncclSuccess;
}
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
#endif
@@ -4,9 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "net.h"
#include "param.h"
#include "nccl.h"
#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
+212
Fájl megtekintése
@@ -0,0 +1,212 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "comm.h"
#include "topo.h"
// Thread-count overrides, read in ncclSetThresholds() through
// ncclParamNthreads() and ncclParamLl128Nthreads(). The default of -2 is not
// positive, which makes getNthreads() fall back to its built-in default.
// NOTE(review): the matching environment variables are presumably
// NCCL_NTHREADS and NCCL_LL128_NTHREADS - confirm against the NCCL_PARAM macro.
NCCL_PARAM(Nthreads, "NTHREADS", -2);
NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
// Validate a user-requested thread count.
//   name : name of the setting, used in warnings
//   env  : requested value; anything <= 0 means "not set"
//   min  : smallest accepted value
//   max  : largest accepted value
//   def  : value used when the setting is not set
// Returns `def` when unset, `max` when the value is not a multiple of
// WARP_SIZE or is above `max`, `min` when it is below `min`, and the
// requested value otherwise.
static int getNthreads(const char* name, int env, int min, int max, int def) {
  if (env <= 0) return def;

  if (env % WARP_SIZE != 0) {
    WARN("Invalid %s %d (must be a multiple of %d)", name, env, WARP_SIZE);
    return max;
  }
  if (env > max) {
    WARN("Invalid %s %d (maximum %d).", name, env, max);
    return max;
  }
  if (env < min) {
    WARN("Invalid %s %d (minimum %d).", name, env, min);
    return min;
  }
  return env;
}
// Parse a comma-separated list of names into a 0/1 array.
//   str    : list to parse; a leading '^' inverts it (everything is enabled
//            except the listed names)
//   elems  : the `nelems` known names, matched case-insensitively
//   nelems : number of entries in `elems` and `list`
//   list   : output; list[i] is 1 when elems[i] is enabled, 0 otherwise
// Unknown tokens are ignored.
// Returns ncclSystemError if the working copy of `str` cannot be allocated.
ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
  int def, set;
  if (str[0] == '^') {
    def = 1; set = 0; str++;
  } else {
    def = 0; set = 1;
  }
  for (int i=0; i<nelems; i++) list[i] = def;

  // strtok_r modifies its input, so work on a private copy.
  char* tokStr = strdup(str);
  if (tokStr == NULL) {
    WARN("Error : could not duplicate list string %s", str);
    return ncclSystemError;
  }
  char* tmpStr;
  char* token = strtok_r(tokStr, ",", &tmpStr);
  while (token) {
    for (int i=0; i<nelems; i++)
      if (strcasecmp(token, elems[i]) == 0) list[i] = set;
    token = strtok_r(NULL, ",", &tmpStr);
  }
  free(tokStr);
  return ncclSuccess;
}
// Display names, used for the tuning table and for matching NCCL_PROTO /
// NCCL_ALGO values. Each array is indexed by the corresponding
// function / algorithm / protocol index.
static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
static const char* ncclAlgoStr[] = { "Tree", "Ring" };
static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
// Base latency added once per collective, indexed by [algorithm][protocol].
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 } };
// NVLink, PCI, Network
// First index of hwLat below.
#define NCCL_HW_NVLINK 0
#define NCCL_HW_PCI 1
#define NCCL_HW_NET 2
// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
// Per-step latency, indexed by [hardware][algorithm][protocol].
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 } }
};
// LL128 max BW for the different collectives
// Indexed by collective, in the same order as ncclFuncStr.
static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
// Fill in the tuning model of a communicator:
//   - comm->maxThreads      : max threads per block for each protocol
//   - comm->bandwidths      : estimated algorithm bandwidth per
//                             collective/algorithm/protocol (0 = disabled)
//   - comm->latencies       : estimated latency per collective/algorithm/protocol
//   - comm->threadThresholds: per-thread amount of work before nThreads and
//                             nChannels are increased
// Parameters:
//   comm       : communicator to configure
//   minCompCap : lowest compute capability among the GPUs (e.g. 70)
//   maxCompCap : highest compute capability among the GPUs
//   treeGraph  : search result for the tree algorithm
//   ringGraph  : search result for the ring algorithm
// User overrides: NCCL_PROTO and NCCL_ALGO (enable/disable lists) and
// NCCL_THREAD_THRESHOLDS (up to six space-separated values, Tree LL/LL128/
// Simple then Ring LL/LL128/Simple; values that are missing or negative keep
// the default).
ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
  int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
  comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
  comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
  comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);

  INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);

  // Nothing to model for a single rank.
  if (comm->nRanks <= 1) return ncclSuccess;

  struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
  int intraHw[2], hw[2];
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;

  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
    int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
      comm->nRanks;

    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      // Tree is only modeled for AllReduce.
      if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
        float busBw = graphs[a]->nChannels * speed * 1.0;

        // Various model refinements
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0;
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;

        // Convert bus BW to algorithm BW
        float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
        comm->bandwidths[coll][a][p] = busBw * ratio;

        comm->latencies[coll][a][p] = baseLat[a][p];
        if (a == NCCL_ALGO_RING) {
          float lat = hwLat[hw[a]][a][p];
          if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
            if (ringGraph->sameChannels) {
              comm->latencies[coll][a][p] += lat;
            } else {
              if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
              comm->latencies[coll][a][p] += nsteps*lat;
            }
          } else {
            comm->latencies[coll][a][p] += nsteps*lat;
          }
        } else {
          float intraLat = hwLat[intraHw[a]][a][p];
          float interLat = hwLat[NCCL_HW_NET][a][p];
          comm->latencies[coll][a][p] +=
            2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
        }
      }
    }
  }

  // Protocols/Algorithms enable/disable, and user overrides.
  // All are enabled except ll128 which is enabled by default only in certain cases.
  // A value of 2 means "decide automatically".
  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };

  const char *protoStr = getenv("NCCL_PROTO");
  if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
  const char *algoStr = getenv("NCCL_ALGO");
  if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));

  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    int pEnable = protoEnable[p];
    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
      // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
      pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
    }
    // A bandwidth of 0 marks the combination as disabled.
    if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
  }

  // Rank 0 logs the resulting latency/bandwidth table.
  if (comm->rank == 0) {
    char line[1024];
    int offset = 0;
    sprintf(line, "Latency/AlgBw |");
    offset = strlen(line);
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
        offset = strlen(line);
      }
    }
    INFO(NCCL_TUNING, "%s", line);
    for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
      sprintf(line, "%13s |", ncclFuncStr[c]);
      offset = strlen(line);
      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
          sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
          offset = strlen(line);
        }
      }
      INFO(NCCL_TUNING, "%s", line);
    }
  }

  // Set per-thread amount of work before we increase nThreads and nChannels
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
    comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
    comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
  }
  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;

  // Override defaults with user env
  char* str = getenv("NCCL_THREAD_THRESHOLDS");
  if (str) {
    // Every entry must start negative ("keep default"). Writing `= { -2 }`
    // only sets the first element and zero-fills the rest, which turned any
    // value missing from the string into an override of 0.
    ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
    sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
      }
    }
  }

  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
  return ncclSuccess;
}
@@ -51,11 +51,6 @@ struct ncclAsyncArgs {
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
ncclResult_t ncclSetDevice(int cudaDev) {
CUDACHECK(cudaSetDevice(cudaDev));
return ncclSuccess;
}
#define CHECK(a) do { \
if ((args->ret = (a)) != ncclSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
@@ -65,15 +60,14 @@ ncclResult_t ncclSetDevice(int cudaDev) {
void* ncclAsyncThreadMain(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
CHECK(ncclSetDevice(args->init.cudaDev));
CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank));
CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
return args;
}
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) {
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
return ncclAsyncErrCheck(ncclInternalError);
return ncclAsyncErrCheck(ncclInvalidUsage);
}
int index = ncclGroupIndex++;
struct ncclAsyncArgs* args = ncclGroupArgs+index;
@@ -84,8 +78,6 @@ ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm
args->init.ndev = ndev;
memcpy(&args->init.commId, &commId, sizeof(commId));
args->init.myrank = myrank;
// We need to use threads for Init
pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args);
return ncclSuccess;
}
@@ -97,7 +89,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
}
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
return ncclAsyncErrCheck(ncclInternalError);
return ncclAsyncErrCheck(ncclInvalidUsage);
}
ncclGroupIndex++;
args->funcType = ASYNC_FUNC_COLL;
@@ -124,6 +116,14 @@ ncclResult_t ncclGroupEnd() {
ncclResult_t ret = ncclGroupError;
if (ret != ncclSuccess) goto group_cleanup;
/* Launch async ncclCommInitRank */
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_INIT) {
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
}
}
/* Collectives are done in three steps :
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
* 2. Barrier Wait. No CUDA call is permitted
@@ -166,8 +166,8 @@ ncclResult_t ncclGroupEnd() {
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
if (err == EBUSY) continue;
if (err != 0) { ret = ncclSystemError; goto end; }
if (args->ret != ncclSuccess) { ret = args->ret; goto end; }
if (err != 0) ret = ncclSystemError;
if (args->ret != ncclSuccess) ret = args->ret;
doneArray[i] = 1;
done--;
}
@@ -175,20 +175,47 @@ ncclResult_t ncclGroupEnd() {
}
goto end;
group_cleanup:
// At least one call in the group failed. Since we want to make that group
// an atomic operation, we need to cancel all operations.
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
for (int i=0; i<channel->collCount; i++) {
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
if (ret != ncclSuccess) {
// At least one call in the group failed. Since we want to make that group
// an atomic operation, we need to cancel all operations.
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm));
*args->init.newcomm = NULL;
} else {
struct ncclComm* comm = args->coll.comm;
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
for (int i=0; i<channel->collCount; i++) {
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
}
channel->collFifoTail = channel->collStart;
channel->collCount = 0;
}
/* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
struct ncclProxyState* state = &comm->proxyState;
struct ncclProxyArgs *op, *start;
pthread_mutex_lock(&state->mutex);
op = start = state->ops;
while (op) {
if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
struct ncclProxyArgs* peerOp = op->nextPeer;
while (peerOp) {
if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
peerOp = peerOp->nextPeer;
}
op = op->next;
if (op == start) break;
}
comm->opCount = comm->lastOpCount;
pthread_cond_signal(&state->cond);
pthread_mutex_unlock(&state->mutex);
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
comm->userStreamSet = false;
}
channel->collFifoTail = channel->collStart;
channel->collCount = 0;
}
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
comm->userStreamSet = false;
}
end:
ncclGroupError = ncclSuccess;
@@ -8,6 +8,7 @@
#define NCCL_ARGCHECK_H_
#include "core.h"
#include "info.h"
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
ncclResult_t ArgsCheck(struct ncclInfo* info);
@@ -17,4 +17,5 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
#endif
@@ -6,7 +6,7 @@
#ifndef NCCL_CHANNEL_H_
#define NCCL_CHANNEL_H_
#include "core.h"
#include "comm.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
@@ -7,7 +7,10 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
#include "core.h"
#include "info.h"
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
@@ -22,7 +25,8 @@
#define DECL_COLL4(coll, op, dtype) \
DECL_COLL5(coll, op, dtype) \
DECL_COLL5(coll##LL, op, dtype)
DECL_COLL5(coll##LL, op, dtype) \
DECL_COLL5(coll##LL128, op, dtype)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
+22 -8
Fájl megtekintése
@@ -7,6 +7,8 @@
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#include "transport.h"
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
@@ -18,13 +20,17 @@ struct cudaLaunchParams {
};
#endif
#define MAXCHANNELS 16
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
// Channels / LL tuning
#define NCCL_LL_THREAD_THRESHOLD 8
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
struct ncclSendMem {
union {
struct {
@@ -50,6 +56,7 @@ struct ncclRecvMem {
char pad4[MEM_ALIGN];
};
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
char buff[1]; // Actually larger than that
};
@@ -57,13 +64,18 @@ struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int nvmlDev; // my NVML device number
int64_t busId; // my PCI bus ID in int format
int node;
int nNodes;
int localRanks;
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
@@ -74,17 +86,19 @@ struct ncclComm {
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
uint64_t lastOpCount;
// Channels for collectives
int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// Only nvlink is used for inter-GPU communication
int nvlink;
// Tree algorithm threshold
ssize_t treeThreshold;
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_PROTOCOLS];
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
+21 -15
Fájl megtekintése
@@ -8,19 +8,11 @@
#define NCCL_CORE_H_
#include <pthread.h>
#include <algorithm>
#include "nccl.h"
#include "debug.h"
#include "checks.h"
#include "alloc.h"
#include "transport.h"
#include "devcomm.h"
#include "comm.h"
#include "info.h"
#include "argcheck.h"
#include <cstdio>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm> // For std::min/std::max
#include "nccl.h"
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
@@ -38,10 +30,6 @@
ret func(args)
#endif // end PROFAPI
int ncclCudaCompCap();
ncclResult_t ncclNvlinkGpu(int* nvlink);
int64_t ncclTreeThreshold();
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
@@ -62,4 +50,22 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
}
}
#define NCCL_NUM_FUNCTIONS 5
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
#include "debug.h"
#include "checks.h"
#include "alloc.h"
#include "utils.h"
#include "param.h"
#endif // end include guard
+11 -102
Fájl megtekintése
@@ -7,15 +7,14 @@
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
#include <pthread.h>
#include "core.h"
#include <stdio.h>
#include <chrono>
#include <unistd.h>
#include <sys/syscall.h>
#include <limits.h>
#include <string.h>
#include "nccl.h"
#include "nccl_net.h"
#define gettid() (pid_t) syscall(SYS_gettid)
@@ -25,9 +24,16 @@ extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
// NOWARN(a, ret): evaluate expression `a` with ncclDebugNoWarn set to 1 and
// store its value in `ret`; the flag is then reset to 0.
// The flag is reset unconditionally rather than restored to its previous
// value, so NOWARN invocations do not nest.
#define NOWARN(a, ret) do { \
ncclDebugNoWarn = 1; \
ret = a; \
ncclDebugNoWarn = 0; \
} while (0)
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
@@ -39,101 +45,4 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
#define TRACE(...)
#endif
#include <stdlib.h>
// Initialize the debug state from the environment:
//  - NCCL_DEBUG selects ncclDebugLevel (unset => NCCL_LOG_NONE; an unrecognized
//    value leaves ncclDebugLevel untouched).
//  - NCCL_DEBUG_SUBSYS is a comma separated list of subsystems to add to (or,
//    with a leading '^', remove from) ncclDebugMask.
//  - NCCL_DEBUG_FILE is a file name pattern (%h = hostname, %p = pid, %% = '%')
//    which is expanded and opened for writing when the level is above VERSION.
// Also initializes ncclDebugOutputLock and, with ENABLE_TRACE, ncclEpoch.
static inline void initDebug() {
  const char* nccl_debug = getenv("NCCL_DEBUG");
  if (nccl_debug == NULL) {
    ncclDebugLevel = NCCL_LOG_NONE;
  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
    ncclDebugLevel = NCCL_LOG_VERSION;
  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
    ncclDebugLevel = NCCL_LOG_WARN;
  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
    ncclDebugLevel = NCCL_LOG_INFO;
  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
    ncclDebugLevel = NCCL_LOG_ABORT;
  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
    ncclDebugLevel = NCCL_LOG_TRACE;
  }

  /* Parse the NCCL_DEBUG_SUBSYS env var
   * This can be a comma separated list such as INIT,COLL
   * or ^INIT,COLL etc
   */
  const char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
  // Tokenize a private copy: tokenizing writes NUL bytes into its input, and
  // the string returned by getenv() must not be modified.
  char* subsysList = (nccl_debug_subsys != NULL) ? strdup(nccl_debug_subsys) : NULL;
  if (subsysList != NULL) {
    char* savePtr = NULL;
    char* subsys = strtok_r(subsysList, ",", &savePtr);
    while (subsys != NULL) {
      int invert = 0;
      uint64_t mask = 0;
      if (subsys[0] == '^') { invert = 1; subsys++; }
      if (strcasecmp(subsys, "INIT") == 0) {
        mask = NCCL_INIT;
      } else if (strcasecmp(subsys, "COLL") == 0) {
        mask = NCCL_COLL;
      } else if (strcasecmp(subsys, "P2P") == 0) {
        mask = NCCL_P2P;
      } else if (strcasecmp(subsys, "SHM") == 0) {
        mask = NCCL_SHM;
      } else if (strcasecmp(subsys, "NET") == 0) {
        mask = NCCL_NET;
      } else if (strcasecmp(subsys, "ALL") == 0) {
        mask = NCCL_ALL;
      }
      if (mask) {
        if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
      }
      subsys = strtok_r(NULL, ",", &savePtr);
    }
    free(subsysList);
  }

  /* Parse and expand the NCCL_DEBUG_FILE path and
   * then create the debug file. But don't bother unless the
   * NCCL_DEBUG level is > VERSION
   */
  const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
  if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
    int c = 0;
    char debug_fn[PATH_MAX+1] = "";
    char *dfn = debug_fn;
    // One past the last character slot; debug_fn[PATH_MAX] is kept for the NUL.
    char *const end = debug_fn + PATH_MAX;
    while (nccl_debug_file[c] != '\0' && c < PATH_MAX && dfn < end) {
      if (nccl_debug_file[c++] != '%') {
        *dfn++ = nccl_debug_file[c-1];
        continue;
      }
      // Space left in debug_fn from dfn on, terminating NUL included.
      const size_t room = (size_t)(end - dfn) + 1;
      int written = 0;
      switch (nccl_debug_file[c++]) {
        case '\0': // Pattern ends with a lone '%': keep it and stop at the terminator
          *dfn++ = '%';
          c--;
          break;
        case '%': // Double %
          *dfn++ = '%';
          break;
        case 'h': { // %h = hostname
          char hostname[1024] = "";
          getHostName(hostname, 1024, '.');
          written = snprintf(dfn, room, "%s", hostname);
          break;
        }
        case 'p': // %p = pid
          written = snprintf(dfn, room, "%d", getpid());
          break;
        default: // Echo everything we don't understand
          *dfn++ = '%';
          if (dfn < end) *dfn++ = nccl_debug_file[c-1];
          break;
      }
      // snprintf reports the untruncated length; advance only by what was stored.
      if (written > 0) dfn += ((size_t)written < room) ? written : (int)(room - 1);
    }
    *dfn = '\0';
    if (debug_fn[0] != '\0') {
      FILE *file = fopen(debug_fn, "w");
      if (file != NULL) {
        INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
        ncclDebugFile = file;
      }
    }
  }
  pthread_mutex_init(&ncclDebugOutputLock, NULL);

#ifdef ENABLE_TRACE
  ncclEpoch = std::chrono::high_resolution_clock::now();
#endif
}
#endif
+33 -13
Fájl megtekintése
@@ -13,8 +13,6 @@
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
@@ -38,16 +36,18 @@ union ncclLLFifoLine {
int4 i4;
};
#define MAXTHREADS 256
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NUM_LINES_PER_THREAD 8
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
#define WARP_SIZE 32
#define MAXCHANNELS 32
#define NCCL_MAX_NTHREADS 512
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_LINES_PER_THREAD 8
#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#ifdef DEBUG_LL
#define NCCL_LL_CLEAN_MASK 0x00000ff8
#define NCCL_LL_FLAG_MAX 0x00001000
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
#ifdef TEST_LL_CLEANUP
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define NCCL_LL_FLAG_MAX 0x100
#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
#else
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
@@ -55,6 +55,24 @@ union ncclLLFifoLine {
// Make sure the clean mask will last for at least NCCL_STEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
#define NCCL_LL128_LINESIZE 128
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
#define NCCL_LL128_MAX_NTHREADS 640
#define NCCL_LL128_ELEMS_PER_THREAD 120
// Receiving from up to 3 sources is more compute intensive than sending
// to 3 dests. Use 70% for reduce and 30% for bcast.
// Evaluates to 70% of nt, rounded down to a multiple of 32 (integer arithmetic).
// The argument is parenthesized so that expressions such as a+b expand correctly.
#define NCCL_LL128_SPLIT(nt) ((((nt)*7)/(10*32))*32)
#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS)
#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t))
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
@@ -73,6 +91,9 @@ struct ncclConnInfo {
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
// High bandwidth, low latency protocol
uint64_t* ll128Buff; // Local for recv, remote for send
};
struct ncclConnector {
@@ -148,7 +169,8 @@ struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree treeUp;
struct ncclTree treeDn;
int id;
int nthreads;
@@ -171,8 +193,6 @@ struct ncclChannel {
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
#define MAXCHANNELS 16
typedef enum {
ncclDevSuccess,
ncclDevAssertedMismatch,
@@ -7,14 +7,9 @@
#ifndef NCCL_ENQUEUE_H_
#define NCCL_ENQUEUE_H_
#include "core.h"
#include "comm.h"
#include "group.h"
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MIN_NTHREADS 64
#include "collectives.h"
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
@@ -0,0 +1,94 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GRAPH_H_
#define NCCL_GRAPH_H_
#include "nccl.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
enum ncclPathDist {
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_NODE = 3,
PATH_SYS = 4,
PATH_ARRAY_SIZE = 5
};
extern const char* pathDists[PATH_ARRAY_SIZE];
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
struct ncclTopoSystem;
// Build the topology
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);
// Query topology
ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
#define NCCL_TOPO_MAX_NODES 256
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
#define NCCL_TOPO_PATTERN_RING 4 // Ring
struct ncclTopoGraph {
// Input / output
int pattern;
int crossNic;
// Output
int nChannels;
int speedIntra;
int speedInter;
int type;
int nvlink;
int sameChannels;
int nHops;
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
int inter[MAXCHANNELS*2];
};
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
struct ncclTopoRanks {
int ringRecv[MAXCHANNELS];
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeUpRecv[MAXCHANNELS];
int treeUpSend[MAXCHANNELS];
int treeDnRecv[MAXCHANNELS];
int treeDnSend[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
struct ncclTopoRanks** allTopoRanks, int* rings);
ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
#endif
+3 -3
Fájl megtekintése
@@ -8,14 +8,14 @@
#define NCCL_GROUP_H_
#include "nccl.h"
#include "core.h"
#include "comm.h"
bool ncclAsyncMode();
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+6 -1
Fájl megtekintése
@@ -8,6 +8,7 @@
#define NCCL_INFO_H_
#include "nccl.h"
#include "core.h"
typedef enum {
ncclPatternRing,
@@ -21,7 +22,7 @@ typedef enum {
// Used to pass NCCL call information between functions
struct ncclInfo {
ncclColl_t coll;
ncclFunc_t coll;
const char* opName;
// NCCL Coll Args
const void* sendbuff;
@@ -36,7 +37,11 @@ struct ncclInfo {
int chunkSteps;
int sliceSteps;
// Computed later
int algorithm;
int protocol;
ncclPattern_t pattern;
int nChannels;
int nThreads;
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
@@ -15,7 +15,7 @@
#define NCCL_PTR_CUDA 0x2
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+31 -1
Fájl megtekintése
@@ -17,7 +17,6 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
@@ -31,6 +30,37 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
// Size in bytes of the temporary GPU buffer used by ncclNetPtrSupport to probe
// GPU memory registration.
#define GPU_BUF_SIZE (2*1024*1024)
// Report the pointer types usable with network device `dev`.
// The plugin's ptrSupport() answer is taken as-is for every bit except
// NCCL_PTR_CUDA, which is only reported when a trial registration of a GPU
// buffer succeeds on both a send comm and a recv comm.
// Returns the ptrSupport() error if that query fails (via NCCLCHECK);
// otherwise always returns ncclSuccess, and a failed probe simply leaves
// NCCL_PTR_CUDA cleared in *supportedTypes.
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) {
int support;
NCCLCHECK(ncclNet->ptrSupport(dev, &support));
// Start without the CUDA bit; it is added back only if the probe below passes.
*supportedTypes = support & ~NCCL_PTR_CUDA;
// The network supports GPU Direct RDMA ; verify the GPU supports it as well.
if (support & NCCL_PTR_CUDA) {
void *lComm = NULL, *sComm = NULL, *rComm = NULL;
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
// res only drives the jump to cleanup; it is not returned to the caller.
ncclResult_t res;
// Listen, connect and accept on the same device using the same handle, so
// both ends of the connection live in this process.
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup);
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup);
// First registration attempt goes through NOWARN; a failure here is the
// signal that GPU memory cannot be registered.
NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res);
if (res != ncclSuccess) goto cleanup;
NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup);
NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
// Every step succeeded: GPU memory can be registered on both sides.
*supportedTypes |= NCCL_PTR_CUDA;
cleanup:
// Release whatever was acquired; each resource is NULL until it is created.
if (gpuPtr) cudaFree(gpuPtr);
if (rComm) ncclNetCloseRecv(rComm);
if (sComm) ncclNetCloseSend(sComm);
if (lComm) ncclNetCloseListen(lComm);
}
return ncclSuccess;
}
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
@@ -1,133 +0,0 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVLINK_H_
#define NCCL_NVLINK_H_
#include <sys/stat.h>
#include <fcntl.h>
#include "nvmlwrap.h"
#include "topo.h"
#define CONNECT_NVLINK 0x10
#define CONNECT_NVSWITCH 0x100
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
// Classify the PCI device with bus id `busId` by reading its sysfs "class" file.
// busId : at least 12 characters in the form "0000:00:00.0"; exactly that many
//         bytes are copied into the sysfs path.
// type  : set to GPU, NVSwitch or IBM bridge when the class code is recognized.
// Returns ncclSuccess when *type was set, ncclSystemError when the class file
// could not be opened or holds a class code that is not recognized.
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
  char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
  char* rPath = realpath(classPath, NULL);
  // realpath() returns NULL when the path cannot be resolved; never hand NULL to open().
  int fd = (rPath != NULL) ? open(rPath, O_RDONLY) : -1;
  if (fd == -1) {
    // Could not find device. It might be because we're in a VM and
    // we don't see the whole machine. This is handled silently so
    // we don't want to print an INFO error.
    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath ? rPath : classPath, strerror(errno));
    free(rPath); // realpath() allocated the buffer; release it on the failure path too
    return ncclSystemError;
  }
  free(rPath);

  // Pre-filled so the buffer stays NUL terminated after the 8 byte read below.
  char pciClass[9];
  strncpy(pciClass, "0x000000", 9);
  int len;
  // NOTE(review): if SYSCHECKVAL returns early on a failed read, fd is not
  // closed -- confirm against the macro definition.
  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
  SYSCHECK(close(fd), "close");

  if (strcmp(pciClass, "0x068000") == 0) {
    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
    *type = ncclNvLinkDeviceSwitch;
  } else if (strcmp(pciClass, "0x068001") == 0) {
    // PCI device is of type "Bridge: IBM Device 04ea"
    *type = ncclNvLinkDeviceBridge;
  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
    *type = ncclNvLinkDeviceGpu;
  } else {
    // Ignore if we don't know what's on the other side.
    return ncclSystemError;
  }
  return ncclSuccess;
}
/* Report the maximum number of NVLinks for the current CUDA device,
 * chosen from the major number of its compute capability. */
static ncclResult_t getMaxNvlinks(int* maxLinks) {
  int dev;
  CUDACHECK(cudaGetDevice(&dev));

  int major;
  CUDACHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

  // 6 for Volta, 4 for Pascal
  if (major > 6) {
    *maxLinks = 6;
  } else {
    *maxLinks = 4;
  }
  return ncclSuccess;
}
// Count the usable NVLinks of the GPU with PCI bus id `busId1`.
// busId2 : when non-NULL, only links whose remote end is that bus id are
//          counted as direct links; when NULL, any link to a GPU is counted.
// Links whose remote end is an NVSwitch (or cannot be identified) are counted
// separately and take precedence in the result.
// Returns CONNECT_NVSWITCH * <switch links> if any switch link was found,
// otherwise CONNECT_NVLINK * <direct links>; 0 when the NVML handle for busId1
// cannot be obtained.
static int getNvlinkGpu(const char* busId1, const char* busId2) {
  // Determine if that connection is through NVLink
  int links = 0;
  int nvswitch_links = 0;
  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
  nvmlDevice_t nvmlDev;
  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
  if (res != ncclSuccess) return 0;

  for(int l=0; l<maxNvLinks; ++l) {
    // Check whether we can use this NVLink for P2P
    unsigned canP2P;
    if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;

    // Make sure the Nvlink is up. The previous call should have trained the link.
    nvmlEnableState_t isActive;
    if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;

    // Try to figure out what's on the other side of the NVLink
    nvmlPciInfo_t remoteProc;
    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;

    // Old versions of NVML return a lowercase PCI ID
    char* p = remoteProc.busId;
    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
      if (p[c] == 0) break;
      // <ctype.h> functions require a value representable as unsigned char
      p[c] = toupper((unsigned char)p[c]);
    }

    if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
      links++;
    } else {
      // Make a lower case copy of the bus ID for calling ncclDeviceType
      // PCI system path is in lower case
      // Zero-filled so the copy is always terminated and never exposes
      // uninitialized bytes, whatever the length of the remote bus id.
      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE] = {0};
      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
        if (p[c] == 0) break;
        lowerId[c] = tolower((unsigned char)p[c]);
      }

      // Determine if the remote side is NVswitch or a GPU
      enum ncclNvLinkDeviceType type;
      ncclResult_t ret = ncclDeviceType(lowerId, &type);
      if (ret == ncclSuccess) {
        if (type == ncclNvLinkDeviceSwitch) {
          //TODO: we are making an assumption that all GPUs are connected to this switch
          //This assumption may change for future architectures
          nvswitch_links++;
        } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
          links++;
        }
      } else {
        // The NVLink is up but we couldn't find the PCI device on the other
        // side. Assume it's an NVswitch outside a VM.
        if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
        nvswitch_links++;
      }
    }
  }
  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
}
#endif
+22 -4
Fájl megtekintése
@@ -9,18 +9,31 @@
#include "nccl.h"
//#define NVML_DIRECT 1
#ifdef NVML_DIRECT
#include "nvml.h"
// The NVML library doesn't appear to be thread safe
#include <pthread.h>
extern pthread_mutex_t nvmlLock;
#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
#define NVMLLOCKCALL(cmd, ret) do { \
NVMLLOCK(); \
ret = cmd; \
NVMLUNLOCK(); \
} while(false)
#define NVMLCHECK(cmd) do { \
nvmlReturn_t e = cmd; \
nvmlReturn_t e; \
NVMLLOCKCALL(cmd, e); \
if( e != NVML_SUCCESS ) { \
WARN("NVML failure '%s'", nvmlErrorString(e)); \
return ncclSystemError; \
} \
} while(false)
//#define NVML_DIRECT 1
#ifdef NVML_DIRECT
#include "nvml.h"
static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
@@ -57,6 +70,10 @@ static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned i
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
return ncclSuccess;
}
#else
// Dynamically handle dependencies on NVML
@@ -139,6 +156,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
#endif // NVML_DIRECT
@@ -1,17 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_RINGS_H_
#define NCCL_RINGS_H_
// Default thread count: 128 when ncclCudaCompCap() reports 3, 256 otherwise.
static int getDefaultThreads() {
  // On Kepler, rings are doubled later.
  if (ncclCudaCompCap() == 3) {
    return 128;
  }
  return 256;
}
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
#endif
+11 -9
Fájl megtekintése
@@ -66,7 +66,9 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
if (searchNot) prefixList++;
bool searchExact = prefixList && prefixList[0] == '=';
if (searchExact) prefixList++;
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
int found = 0;
@@ -118,17 +120,17 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
return found;
}
static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
/* Check family first */
int family = local_if.ifa_addr->sa_family;
if (family != remote.sa.sa_family) {
if (family != remote->sa.sa_family) {
return false;
}
if (family == AF_INET) {
struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
struct sockaddr_in& remote_addr = remote.sin;
struct sockaddr_in& remote_addr = remote->sin;
struct in_addr local_subnet, remote_subnet;
local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
@@ -136,7 +138,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
} else if (family == AF_INET6) {
struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
struct sockaddr_in6& remote_addr = remote.sin6;
struct sockaddr_in6& remote_addr = remote->sin6;
struct in6_addr& local_in6 = local_addr->sin6_addr;
struct in6_addr& mask_in6 = mask->sin6_addr;
struct in6_addr& remote_in6 = remote_addr.sin6_addr;
@@ -161,7 +163,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
}
}
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
#ifdef ENABLE_TRACE
char line[1024];
#endif
@@ -189,13 +191,13 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
// Store the interface name
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr->sa), line_a));
found++;
if (found == maxIfs) break;
}
if (found == 0) {
WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr->sa), line_a));
}
freeifaddrs(interfaces);
return found;
@@ -300,7 +302,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
// Try to find interface that is in the same subnet as the IP in comm id
union socketAddress idAddr;
GetSocketAddrFromString(&idAddr, commId);
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
}
}
// Then look for anything else (but not docker or lo)
@@ -387,7 +389,7 @@ retry:
if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
usleep(SLEEP_INT);
goto retry;
}
@@ -1,45 +0,0 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TOPO_H_
#define NCCL_TOPO_H_
#include "nccl.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
ncclResult_t getCudaPath(int cudaDev, char** path);
// Read the integer stored in "<path>/numa_node".
// Returns -1 when the file cannot be opened, is empty, or does not start with
// an integer; otherwise returns the value that was read.
static int getNumaId(char *path) {
  char numaFile[PATH_MAX];
  snprintf(numaFile, PATH_MAX, "%s/numa_node", path);
  numaFile[PATH_MAX-1] = '\0';

  FILE *fp = fopen(numaFile, "r");
  if (fp == NULL) {
    return -1;
  }

  // Stays at -1 when fscanf cannot convert anything.
  int id = -1;
  const int rc = fscanf(fp, "%d", &id);
  fclose(fp);
  return (rc == EOF) ? -1 : id;
}
enum ncclPathDist {
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_NODE = 3,
PATH_SYS = 4,
PATH_ARRAY_SIZE = 5
};
extern const char* pathDists[PATH_ARRAY_SIZE];
int pciDistance(char* path1, char* path2);
#endif
+11 -11
Fájl megtekintése
@@ -7,12 +7,15 @@
#ifndef NCCL_TRANSPORT_H_
#define NCCL_TRANSPORT_H_
#include "nccl.h"
#include "devcomm.h"
#include <stdint.h>
#include "graph.h"
#include "nvmlwrap.h"
#include "core.h"
#define NTRANSPORTS 3
#define TRANSPORT_P2P 0
#define TRANSPORT_SHM 1
#define TRANSPORT_NET 2
extern struct ncclTransport ncclTransports[];
@@ -24,15 +27,13 @@ struct ncclComm;
struct ncclPeerInfo {
int rank;
int cudaDev;
int nvmlDev;
int gdrSupport;
uint64_t hostHash;
uint64_t pidHash;
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
dev_t shmDev;
int64_t busId;
};
// Used to hold the transport connection values
typedef int64_t ncclTvalue_t;
#define CONNECT_SIZE 128
struct ncclConnect {
char data[CONNECT_SIZE];
@@ -51,7 +52,7 @@ struct ncclProxyArgs {
int chunkSteps;
int nsteps;
uint64_t opCount;
int llMode;
int protocol;
int state; // add component before this line -- it is left out during initialization
// Internal state
@@ -78,7 +79,7 @@ struct ncclProxyState {
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -86,8 +87,7 @@ struct ncclTransportComm {
struct ncclTransport {
const char name[4];
ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
};
@@ -10,6 +10,14 @@
#include "nccl.h"
#include <stdint.h>
int ncclCudaCompCap();
// PCI Bus ID <-> int64 conversion functions
ncclResult_t int64ToBusId(int64_t id, char* busId);
ncclResult_t busIdToInt64(char* busId, int64_t* id);
ncclResult_t getBusId(int cudaDev, int64_t *busId);
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
@@ -23,4 +31,10 @@ struct netIf {
int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
// Integer base-2 logarithm: returns floor(log2(n)) for n >= 1.
// Returns 0 for n <= 0. Without the guard, a negative n would never reach
// zero on platforms where >> on a negative value is an arithmetic shift, and
// the loop would not terminate.
static long log2i(long n) {
  long l = 0;
  if (n <= 0) return 0;
  while (n>>=1) l++;
  return l;
}
#endif
A különbségek nem kerülnek megjelenítésre, mivel a fájl túl nagy Load Diff
@@ -5,6 +5,7 @@
************************************************************************/
#include "argcheck.h"
#include "comm.h"
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
+51 -7
Fájl megtekintése
@@ -16,6 +16,7 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
static nvmlReturn_t (*nvmlInternalShutdown)(void);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
@@ -23,7 +24,10 @@ static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t dev
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
// Used to make the NVML library calls thread safe
pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;
ncclResult_t wrapNvmlSymbols(void) {
if (nvmlState == nvmlInitialized)
@@ -70,12 +74,14 @@ ncclResult_t wrapNvmlSymbols(void) {
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);
nvmlState = nvmlInitialized;
return ncclSuccess;
@@ -85,6 +91,7 @@ teardown:
nvmlInternalShutdown = NULL;
nvmlInternalDeviceGetHandleByPciBusId = NULL;
nvmlInternalDeviceGetIndex = NULL;
nvmlInternalDeviceGetHandleByIndex = NULL;
nvmlInternalDeviceGetPciInfo = NULL;
nvmlInternalDeviceGetMinorNumber = NULL;
nvmlInternalDeviceGetNvLinkState = NULL;
@@ -130,7 +137,8 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
nvmlReturn_t ret;
NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
nvmlInternalErrorString(ret));
@@ -144,7 +152,8 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
nvmlReturn_t ret;
NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetIndex() failed: %s ",
nvmlInternalErrorString(ret));
@@ -153,12 +162,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
return ncclSuccess;
}
// Wrapper around the dynamically loaded nvmlDeviceGetHandleByIndex symbol.
// Returns ncclInternalError when the symbol pointer has not been set,
// ncclSystemError when NVML reports a failure, ncclSuccess otherwise.
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
  if (!nvmlInternalDeviceGetHandleByIndex) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }

  // NVMLLOCKCALL presumably runs the call under nvmlLock -- confirm against its definition.
  nvmlReturn_t ret;
  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
  if (ret == NVML_SUCCESS) return ncclSuccess;

  WARN("nvmlDeviceGetHandleByIndex() failed: %s ", nvmlInternalErrorString(ret));
  return ncclSystemError;
}
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
if (nvmlInternalDeviceGetPciInfo == NULL) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
nvmlReturn_t ret;
NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetPciInfo() failed: %s ",
nvmlInternalErrorString(ret));
@@ -172,7 +197,8 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
nvmlReturn_t ret;
NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetMinorNumber() failed: %s ",
nvmlInternalErrorString(ret));
@@ -186,7 +212,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
/* Do not warn, this symbol is optional. */
return ncclInternalError;
}
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
nvmlReturn_t ret;
NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
@@ -201,7 +228,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
/* Do not warn, this symbol is optional. */
return ncclInternalError;
}
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
nvmlReturn_t ret;
NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
@@ -217,7 +245,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
/* Do not warn, this symbol is optional. */
return ncclInternalError;
}
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
nvmlReturn_t ret;
NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
@@ -226,4 +255,19 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
}
return ncclSuccess;
}
// Wrapper around the dynamically loaded nvmlDeviceGetCudaComputeCapability
// symbol. Writes the compute capability of `device` to *major / *minor.
// Returns ncclInternalError when the symbol pointer has not been set,
// ncclSystemError when NVML reports a failure, ncclSuccess otherwise.
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
  // Check this wrapper's own symbol. The NvLinkCapability pointer is loaded
  // as an optional symbol, so testing it here could reject a usable library
  // or let a NULL function pointer through.
  if (nvmlInternalDeviceGetCudaComputeCapability == NULL) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }
  nvmlReturn_t ret;
  NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret);
  if (ret != NVML_SUCCESS) {
    WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
        nvmlInternalErrorString(ret));
    return ncclSystemError;
  }
  return ncclSuccess;
}
#endif
-391
Fájl megtekintése
@@ -1,391 +0,0 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "param.h"
#define NCCL_MAX_SCORE 7
/* Parse user defined rings. Format is like :
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
* Rings with a non-matching number of ranks are ignored so we can provide
* rings for multiple cases.
*/
#define MAX_ENV_RANKS 512
// Parse user-defined rings from `str` (see the format description above).
//   str       : '|'-separated rings, each a list of rank numbers separated by
//               any non-digit character.
//   nringsRet : receives the number of rings accepted.
//   nranks    : expected number of ranks per ring; rings of any other length
//               are ignored.
//   prev/next : for accepted ring k, prev[k*nranks+r] / next[k*nranks+r] get
//               the predecessor / successor of rank r.
// A ring is accepted only if it is a permutation of 0..nranks-1. Parsing stops
// once MAX_ENV_RANKS numbers have been read for a single ring.
// Always returns ncclSuccess.
// NOTE(review): the number of accepted rings is not capped here; the caller
// must size prev/next for as many rings as the string may contain -- confirm.
static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
  int ranks[MAX_ENV_RANKS];
  int nrings = 0;
  int nparsed = 0;   // numbers read so far for the ring being parsed
  int offset = 0;
  int inNumber = 0;  // 0 : between numbers, 1 : inside number
  do {
    const char c = str[offset];
    if (c >= '0' && c <= '9') {
      const int digit = c - '0';
      if (!inNumber) {
        ranks[nparsed] = digit;
        inNumber = 1;
      } else if (ranks[nparsed] < nranks) {
        // Stop accumulating once the value is already out of range: it can
        // only stay invalid, and this avoids signed overflow on long digit runs.
        ranks[nparsed] = ranks[nparsed]*10+digit;
      }
    } else {
      if (inNumber) {
        nparsed++;
        if (nparsed == MAX_ENV_RANKS) goto end;
      }
      inNumber = 0;
      if (c == '|' || c == '\0') {
        // Ignore rings if nranks doesn't match. The nparsed > 0 test also
        // keeps an empty segment from reading ranks[-1].
        int valid = (nparsed == nranks && nparsed > 0);
        // Validate the whole ring before touching prev/next, so that an
        // out-of-range rank is never used as an index.
        for (int r=0; valid && r<nranks; r++) {
          // Ignore rings with ranks out of bounds
          if (ranks[r] < 0 || ranks[r] >= nranks) valid = 0;
          // Ignore rings with duplicate ranks
          for (int i=0; valid && i<r; i++) {
            if (ranks[i] == ranks[r]) valid = 0;
          }
        }
        if (valid) {
          // The ring is circular: the last rank precedes the first one.
          int prevRank = ranks[nranks-1];
          for (int r=0; r<nranks; r++) {
            const int cur = ranks[r];
            next[nrings*nranks+prevRank] = cur;
            prev[nrings*nranks+cur] = prevRank;
            prevRank = cur;
          }
          nrings++;
        }
        nparsed = 0;
      }
    }
  } while (str[offset++] != 0);
end:
  *nringsRet = nrings;
  return ncclSuccess;
}
/*
* Ring creation algorithm
*
* First, we establish hierarchical coordinates depending on the way ranks can
* communicate. After fillCoords, we have for each rank a unique 3-int array
* { node, pci_domain, rank } corresponding to the three transports :
* { 2[NET], 1[SHM], 0[P2P] }.
* Also, we renumber ranks (to indexes) based on their growing coordinates.
*
* Then, we ask transports to connect groups together. We start with net, then
* shm, then p2p. We maintain two arrays, prev and next, where values are equal
* to -1 when ranks are not yet connected, and a rank otherwise. We never
* connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
* ranks, if we are rank 13, we should see something like (provided we have a
* single net interface, hence a single ring) :
*
* Connecting all nodes <13>
* 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
*
* Connecting P2P domains with shared memory <13>
* 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
*
* Connecting ranks (only inside the P2P domain) <13>
* 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
*
* Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net
* which always sees the full world). That way, P2P can bruteforce all combinations inside the node without
* risking to explode in terms of combinations, and we scale better.
*
* Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
* we get at least one ring.
*/
// Depth-first walk over the nranks x nranks `matrix`: marks `rank` in
// `connected`, then recurses into every not-yet-marked peer whose entry in
// this rank's row equals `transport`.
static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
  const int* row = matrix + rank*nranks;
  connected[rank] = 1;
  for (int peer = 0; peer < nranks; peer++) {
    if (connected[peer] != 0) continue;      // already visited
    if (row[peer] != transport) continue;    // not linked through this transport
    recIsConnected(peer, connected, nranks, matrix, transport);
  }
}
// Fills connected[0..nranks-1] with 1 for every rank reachable from `rank`
// through links of type `transport` in `matrix` (including `rank` itself),
// and 0 for all others.
static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
  // Clear all flags before the recursive walk.
  int r = nranks;
  while (r-- > 0) connected[r] = 0;
  recIsConnected(rank, connected, nranks, matrix, transport);
}
// Give `rank` the next index: record the rank<->index mapping in both
// directions, copy the current[] coordinates into coords[rank], then advance
// idx. Expects rankToIdx, idxToRank, coords, current and idx to be in scope
// at the expansion site.
#define NEW_IDX(rank) do { \
rankToIdx[rank] = idx; \
idxToRank[idx] = rank; \
for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
idx++; \
} while (0)
// Returns the lowest rank r whose first coordinate is still -1 (no
// coordinates assigned yet) and whose entry in `rank`'s row of `matrix`
// equals `transport`; returns -1 when there is none.
int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
  int found = -1;
  for (int r = 0; r < nranks && found == -1; r++) {
    const int unassigned = (coords[r*NTRANSPORTS] == -1);
    if (unassigned && matrix[rank*nranks+r] == transport) found = r;
  }
  return found;
}
// Assign hierarchical coordinates and a topological index to every rank.
//   nranks    : number of ranks.
//   matrix    : nranks x nranks table; matrix[a*nranks+b] is the transport
//               used between ranks a and b.
//   coords    : output, NTRANSPORTS ints per rank. findConnected() treats a
//               first coordinate of -1 as "not assigned yet".
//   rankToIdx / idxToRank : output, the two directions of the rank<->index map.
// Returns ncclSuccess once all nranks ranks have an index, ncclInternalError
// when no transport links the current group to a rank without coordinates.
static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
// Coordinates that the next NEW_IDX() will assign.
int current[NTRANSPORTS];
int* p2pConnected;
NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
for (int i=0; i<NTRANSPORTS; i++) current[i] = 0;
int curRank = 0, idx = 0;
while (1) {
// P2P is handled separately as there is no level below it and we need to
// cover the case of being connected to another GPU indirectly.
// So we detect all GPUs in the same P2P domain once and add them all at
// once.
isConnected(curRank, p2pConnected, nranks, matrix, 0);
for (int r=0; r<nranks; r++) {
if (p2pConnected[r]) {
NEW_IDX(r);
curRank = r;
current[0]++;
}
}
// The P2P coordinate restarts at 0 for the next group.
current[0] = 0;
// Every rank has an index: done.
if (idx == nranks) {
free(p2pConnected);
return ncclSuccess;
}
// Find next group, either connected through SHM or NET.
int rank;
int transport = 1;
// Try transports in increasing order, resetting the coordinate of each level
// that yields no candidate before moving to the next one.
while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
current[transport] = 0;
transport++;
if (transport == NTRANSPORTS) {
WARN("Error : Could not find transport to connect next group\n");
free(p2pConnected);
return ncclInternalError; }
}
curRank = rank;
current[transport]++;
}
}
#ifdef __PPC__
// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
#define DEFAULT_MIN_NRINGS 4
#else
#define DEFAULT_MIN_NRINGS 0
#endif
NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
/* Users can force the number of threads with an environment variable */
NCCL_PARAM(Nthreads, "NTHREADS", -2);
// Overrides *nthreads with the NTHREADS parameter when it holds anything
// other than -2, the value the parameter is declared with above.
// Always returns ncclSuccess.
ncclResult_t getEnvThreads(int* nthreads) {
  const int64_t requested = ncclParamNthreads();
  if (requested == -2) return ncclSuccess;  // left at the declared value: keep *nthreads
  *nthreads = requested;
  return ncclSuccess;
}
// Extends the four per-ring tables a, b, c, d (nranks entries per ring) from
// nrings to newNrings rings: each new ring r is copied from ring r-nrings.
// newNrings is capped at MAXCHANNELS. Returns the (possibly capped) ring count.
static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
  const int target = (newNrings > MAXCHANNELS) ? MAXCHANNELS : newNrings;
  // Rings are contiguous, so "ring r <- ring r-nrings" is a flat copy where
  // each element comes from nrings*nranks entries earlier.
  const int shift = nrings*nranks;
  const int stop = target*nranks;
  for (int k = shift; k < stop; k++) {
    a[k] = a[k-shift];
    b[k] = b[k-shift];
    c[k] = c[k-shift];
    d[k] = d[k-shift];
  }
  return target;
}
/* Main ring creation function */
// Build the communication rings for this communicator.
//   nrings     : output, number of rings created.
//   nthreads   : in/out, thread count; passed to the transports' getRings()
//                and finally overridden by getEnvThreads().
//   rank       : calling rank.
//   nranks     : number of ranks.
//   transports : nranks x nranks table of transport ids between rank pairs.
//   values     : nranks x nranks table of per-pair transport values.
//   prev/next  : output, per ring and per rank, previous/next rank in the ring.
//   treeIn/treeOut : output, per ring and per rank flags. They are set from the
//                top-level (NTRANSPORTS-1) connections, or from transport id 2
//                when rings come from NCCL_RINGS.
// Rings come from the NCCL_RINGS environment variable when it yields at least
// one valid ring; otherwise they are computed level by level through the
// transports' getRings() callbacks, lowering the minimum score until at least
// one ring is found.
// NOTE(review): NCCLCHECK presumably returns from this function on error; if
// so, buffers allocated earlier with ncclCalloc are not freed on those paths.
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
*nrings = 0;
// With a single rank no ring is created.
if (nranks == 1) return ncclSuccess;
char* str = getenv("NCCL_RINGS");
if (str && strlen(str)>0) {
int ret = parseRings(str, nrings, nranks, prev, next);
if (ret == ncclSuccess && *nrings > 0) {
if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
NCCLCHECK(getEnvThreads(nthreads));
// Flag ranks whose previous/next hop in a ring uses transport id 2.
// Entries are only ever set to 1 here, never cleared.
for (int r = 0; r<*nrings; r++) {
for (int i = 0; i<nranks; i++) {
if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
}
}
return ncclSuccess;
}
if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
*nrings = 0;
}
// Compute hierarchical topology groups, indexes, and rank<->index tables
int* coords, *globalIdxToRank, *globalRankToIdx;
NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
// -1 marks "no coordinates yet"; findConnected() tests for it.
for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
// Start with a high score, then decrease until we find rings
int minScore = NCCL_MAX_SCORE;
int nringsTmp;
int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&idxToRank, nranks));
NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
NCCLCHECK(ncclCalloc(&groups, nranks));
NCCLCHECK(ncclCalloc(&subgroups, nranks));
int nThreads;
do {
// Each attempt starts again from the caller's thread count and from empty rings.
nThreads = *nthreads;
for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
nringsTmp = MAXCHANNELS;
// Loop over transports to connect groups
for (int t=NTRANSPORTS-1; t>=0; t--) {
for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
int nidx = 0;
for (int i=0; i<nranks; i++) {
// Extract only ranks in the same local area as rank
// We need to extract them in the topological order, hence we iterate over indexes, not ranks
int r = globalIdxToRank[i];
int sameLocal = 1;
for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
if (!sameLocal) continue;
// Group = coordinate at this level; subgroup = coordinate one level below,
// or the local index itself at the lowest level.
groups[nidx] = coords[r*NTRANSPORTS+t];
subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
rankToIdx[r] = nidx;
idxToRank[nidx] = r;
nidx++;
}
int ngroups = groups[nidx-1] + 1; // Coords should be ordered
ncclTvalue_t* subvalues;
int *subprev, *subnext;
NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
// Nothing to connect at this level when there is a single group.
if (ngroups > 1) {
/* Extract subvalues */
for (int i=0; i<nidx; i++) {
for (int j=0; j<nidx; j++) {
if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
else
subvalues[i*nidx+j] = 0;
}
}
/* Extract subprev/subnext */
for (int i=0; i<nidx*nringsTmp; i++) {
subprev[i] = subnext[i] = -1;
}
for (int r=0; r<nringsTmp; r++) {
// Among the ranks in this subview, find the one that already has a prev
// (start) and the one that already has a next (end) from an upper level,
// and link them in the subview.
int start = -1, end = -1;
for (int i=0; i<nranks; i++) {
if (rankToIdx[i] == -1) continue;
if (prevTmp[r*nranks+i] != -1) start = i;
if (nextTmp[r*nranks+i] != -1) end = i;
}
if (start != -1 && end != -1) {
subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
}
}
/* Get rings */
NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
/* Merge subprev/subnext into prev/next */
for (int r=0; r<nringsTmp; r++) {
for (int i=0; i<nidx; i++) {
if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
if (t == NTRANSPORTS-1) {
// Save node-level masters for trees
treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
}
}
}
//for (int r=0; r<nringsTmp; r++) {
//printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
//printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
//}
}
free(subvalues);
free(subprev);
free(subnext);
// No ring survived this level: give up on this score.
if (nringsTmp == 0) break;
}
minScore--;
// Keep the result when it has more rings than what was found so far.
if (nringsTmp > *nrings) {
*nrings = nringsTmp;
for (int i=0; i<nranks*(*nrings); i++) {
prev[i] = prevTmp[i];
next[i] = nextTmp[i];
}
}
} while (nringsTmp == 0 && minScore);
free(coords);
free(globalRankToIdx);
free(globalIdxToRank);
free(prevTmp);
free(nextTmp);
free(idxToRank);
free(rankToIdx);
free(groups);
free(subgroups);
*nthreads = nThreads;
/* Duplicate the rings in case of multinode+NVLink */
// The node count is taken as the number of ranks flagged in treeIn for ring 0.
int nnodes = 0;
for (int r=0; r<nranks; r++) nnodes += treeIn[r];
int nvlink;
NCCLCHECK(ncclNvlinkGpu(&nvlink));
if (nnodes > 1 && nvlink) {
*nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
}
if (*nrings == 0) {
WARN("Could not create rings, falling back on simple ring");
*nrings = 1;
// NOTE(review): only this rank's own prev/next entries are written here;
// entries for the other ranks keep whatever they held -- confirm intended.
prev[rank] = (rank-1+nranks) % nranks;
next[rank] = (rank+1)%nranks;
}
// Apply the user-requested bounds on the number of rings.
int maxNrings = ncclParamMaxNrings();
int minNrings = ncclParamMinNrings();
if (maxNrings > 0 && minNrings > maxNrings) {
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
minNrings = 0;
}
if (minNrings > MAXCHANNELS) {
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
minNrings = MAXCHANNELS;
}
if (maxNrings > 0 && maxNrings <= *nrings) {
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
*nrings = maxNrings;
} else {
// Minimum is 2 rings when ncclCudaCompCap() returns 3, 1 otherwise.
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
if (minNrings > 0 && minNrings > *nrings) {
if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
*nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
}
}
NCCLCHECK(getEnvThreads(nthreads));
return ncclSuccess;
}
-57
Fájl megtekintése
@@ -1,57 +0,0 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "topo.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
// Resolves the sysfs path of CUDA device `cudaDev`.
// The lower-cased PCI bus id is patched into a /sys/class/pci_bus/... template
// and the result is canonicalized with realpath(). On success *path holds the
// buffer returned by realpath(path, NULL), which the caller must free().
// Returns ncclSystemError (with *path == NULL) when realpath() fails.
ncclResult_t getCudaPath(int cudaDev, char** path) {
  char busId[BUSID_SIZE];
  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
  for (char* c = busId; c < busId+BUSID_SIZE; c++) *c = tolower(*c);

  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  // First slot takes the "domain:bus" prefix, second slot the full bus id.
  char* busSlot = busPath + (sizeof("/sys/class/pci_bus/")-1);
  char* devSlot = busPath + (sizeof("/sys/class/pci_bus/0000:00/../../")-1);
  memcpy(busSlot, busId, BUSID_REDUCED_SIZE-1);
  memcpy(devSlot, busId, BUSID_SIZE-1);

  char* resolved = realpath(busPath, NULL);
  *path = resolved;
  if (resolved != NULL) return ncclSuccess;

  WARN("Could not find real path of %s", busPath);
  return ncclSystemError;
}
const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
// Classifies the PCI distance between two sysfs device paths.
// `depth` counts the '/' separators in path1; `score` counts those that lie
// inside the prefix path1 shares with path2.
// Returns:
//   PATH_NODE / PATH_SYS when score <= 3 (PATH_NODE if both paths report the
//                        same NUMA id, PATH_SYS otherwise; always PATH_NODE
//                        on __PPC__),
//   PATH_PHB             when score == 4,
//   PATH_PIX             when score == depth-1,
//   PATH_PXB             otherwise.
int pciDistance(char* path1, char* path2) {
  int score = 0;
  int depth = 0;
  int same = 1;
  for (int i=0; path1[i] != '\0'; i++) {
    // Only compare while the prefixes still match: once they differ path2 may
    // already have ended, and indexing it further would read past its
    // terminator.
    if (same && path1[i] != path2[i]) same = 0;
    if (path1[i] == '/') {
      depth++;
      if (same) score++;
    }
  }
  if (score <= 3) {
#ifdef __PPC__
    // NUMA distance detection and PATH_SYS are currently not supported on
    // IBM/Power nodes
    return PATH_NODE;
#else
    /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
    int numaId1 = getNumaId(path1);
    int numaId2 = getNumaId(path2);
    TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
    return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
#endif
  }
  if (score == 4) return PATH_PHB;
  if (score == depth-1) return PATH_PIX;
  return PATH_PXB;
}
+74 -85
Fájl megtekintése
@@ -5,27 +5,53 @@
************************************************************************/
#include "utils.h"
#include "debug.h"
#include "nccl_net.h"
#include <unistd.h>
#include <string.h>
#include <stdarg.h>
#include "nvmlwrap.h"
#include "core.h"
#include "nvmlwrap.h"
// Get current Compute Capability
// Returns the compute capability of the current CUDA device encoded as
// major*10+minor, or 0 if any of the CUDA queries fails.
int ncclCudaCompCap() {
  int dev;
  int major, minor;
  // The queries run in order and stop at the first failure.
  if (cudaGetDevice(&dev) != cudaSuccess ||
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev) != cudaSuccess ||
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev) != cudaSuccess) {
    return 0;
  }
  return major*10+minor;
}
// Formats a packed PCI bus id as "dddd:bb:dd.f" (hex) into `busId`.
// Bit layout of `id`: function in bits 0-3, device in bits 4-11, bus in bits
// 12-19, domain in the remaining upper bits.
// `busId` must be large enough for the result: the domain field is printed
// with at least 4 digits but is not truncated. Always returns ncclSuccess.
ncclResult_t int64ToBusId(int64_t id, char* busId) {
  // Cast explicitly to the type named by the conversion specifier: int64_t is
  // not `long` on every ABI, so passing it to %lx is a type mismatch there.
  sprintf(busId, "%04llx:%02llx:%02llx.%01llx",
      (unsigned long long)(id >> 20),
      (unsigned long long)((id & 0xff000) >> 12),
      (unsigned long long)((id & 0xff0) >> 4),
      (unsigned long long)(id & 0xf));
  return ncclSuccess;
}
// Parses a PCI bus id string such as "0000:00:00.0" into a packed integer.
// The '.' and ':' separators are skipped, the remaining hex digits are read as
// one hexadecimal number, and parsing stops at the first other character.
// At most 16 hex digits (the width of an int64_t) are consumed.
// Always returns ncclSuccess.
ncclResult_t busIdToInt64(char* busId, int64_t* id) {
  // A fixed buffer with room for the terminator replaces the former
  // strlen()-sized allocation, which was written one byte past its end when
  // the input contained no separator.
  char hexStr[17];
  int hexOffset = 0;
  for (int i=0; busId[i] != '\0' && hexOffset < 16; i++) {
    char c = busId[i];
    if (c == '.' || c == ':') continue;
    if ((c >= '0' && c <= '9') ||
        (c >= 'A' && c <= 'F') ||
        (c >= 'a' && c <= 'f')) {
      hexStr[hexOffset++] = c;
    } else break;
  }
  hexStr[hexOffset] = '\0';
  // strtoll: `long` may be narrower than int64_t.
  *id = strtoll(hexStr, NULL, 16);
  return ncclSuccess;
}
// Convert a logical cudaDev index to the NVML device minor number
ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
nvmlDevice_t nvmlDevice;
unsigned int dev;
*nvmlDev = -1;
CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
*nvmlDev = dev;
ncclResult_t getBusId(int cudaDev, int64_t *busId) {
// On most systems, the PCI bus ID comes back in the 0000:00:00.0
// format. We still need to allocate enough space in case the PCI domain
// needs more than four hex digits.
char busIdStr[] = "00000000:00:00.0";
CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
NCCLCHECK(busIdToInt64(busIdStr, busId));
return ncclSuccess;
}
@@ -40,53 +66,6 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
return ncclSuccess;
}
/* Common logging function used by the INFO, WARN and TRACE macros
* Also exported to the dynamically loadable Net transport modules so
* they can share the debugging mechanisms and output files
*/
// Formats one log line and writes it to ncclDebugFile.
//   level    : NCCL_LOG_WARN / NCCL_LOG_INFO / NCCL_LOG_TRACE; selects the prefix.
//   flags    : subsystem bits, matched against ncclDebugMask for INFO and TRACE.
//   filefunc : source location string, printed for WARN and TRACE.
//   line     : source line, printed for WARN and TRACE.
//   fmt, ... : printf-style message.
// Nothing is printed when ncclDebugLevel <= NCCL_LOG_NONE or when the level or
// mask filters the message out. Output is serialized with ncclDebugOutputLock.
// When ncclDebugLevel == NCCL_LOG_ABORT, a WARN also aborts the process.
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel <= NCCL_LOG_NONE) return;

  char hostname[1024];
  getHostName(hostname, 1024, '.');
  // Default to -1 so a failing cudaGetDevice() does not leave an
  // indeterminate value to be printed.
  int cudaDev = -1;
  cudaGetDevice(&cudaDev);

  char buffer[1024];
  int prefixLen = 0;
  pthread_mutex_lock(&ncclDebugOutputLock);
  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
    prefixLen = snprintf(buffer, sizeof(buffer),
        "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
    prefixLen = snprintf(buffer, sizeof(buffer),
        "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
#ifdef ENABLE_TRACE
  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
    prefixLen = snprintf(buffer, sizeof(buffer),
        "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
  }
#endif
  if (prefixLen > 0) {
    // snprintf returns the length it wanted to write, which can reach or
    // exceed the buffer size. Clamp it so the remaining-space computation
    // below cannot underflow.
    size_t len = (size_t)prefixLen;
    if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
    va_list vargs;
    va_start(vargs, fmt);
    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
    va_end(vargs);
    fprintf(ncclDebugFile,"%s\n", buffer);
    fflush(ncclDebugFile);
  }
  pthread_mutex_unlock(&ncclDebugOutputLock);

  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
        hostname, getpid(), gettid(), cudaDev, filefunc, line);
    abort();
  }
}
uint64_t getHash(const char* string, int n) {
// Based on DJB2, result = result * 33 + char
uint64_t result = 5381;
@@ -100,27 +79,39 @@ uint64_t getHash(const char* string, int n) {
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt)
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
*
* This string can be overridden by using the NCCL_HOSTID env var.
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
uint64_t getHostHash(void) {
char uname[1024];
// Start off with the full hostname
(void) getHostName(uname, sizeof(uname), '\0');
int offset = strlen(uname);
int len;
// $(readlink /proc/self/ns/uts)
len = readlink("/proc/self/ns/uts", uname+offset, sizeof(uname)-1-offset);
if (len < 0) len = 0;
offset += len;
// $(readlink /proc/self/ns/mnt)
len = readlink("/proc/self/ns/mnt", uname+offset, sizeof(uname)-1-offset);
if (len < 0) len = 0;
offset += len;
// Trailing '\0'
uname[offset]='\0';
TRACE(NCCL_INIT,"unique hostname '%s'", uname);
char hostHash[1024];
char *hostId;
return getHash(uname, strlen(uname));
// Fall back is the full hostname if something fails
(void) getHostName(hostHash, sizeof(hostHash), '\0');
int offset = strlen(hostHash);
if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
strncpy(hostHash, hostId, sizeof(hostHash));
} else {
FILE *file = fopen(HOSTID_FILE, "r");
if (file != NULL) {
char *p;
if (fscanf(file, "%ms", &p) == 1) {
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
free(p);
}
}
fclose(file);
}
// Make sure the string is terminated
hostHash[sizeof(hostHash)-1]='\0';
TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
return getHash(hostHash, strlen(hostHash));
}
/* Generate a hash of the unique identifying string for this process
@@ -147,8 +138,6 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
if (!string) return 0;
const char* ptr = string;
// Ignore "^" or "=" prefix, will be detected outside of this function
if (ptr[0] == '^' || ptr[0] == '=') ptr++;
int ifNum = 0;
int ifC = 0;
+5 -3
Fájl megtekintése
@@ -41,7 +41,7 @@ typedef enum { ncclSuccess = 0,
* This integer is coded with the MAJOR, MINOR and PATCH level of the
* NCCL library
*/
ncclResult_t ncclGetVersion(int *version);
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
@@ -244,7 +244,8 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
* Start a group call. All subsequent calls to NCCL may not block due to
* inter-CPU synchronization.
*/
ncclResult_t ncclGroupStart();
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();
/*
* Group End
@@ -252,7 +253,8 @@ ncclResult_t ncclGroupStart();
* End a group call. Wait for all calls since ncclGroupStart to complete
* before returning.
*/
ncclResult_t ncclGroupEnd();
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
#ifdef __cplusplus
} // end extern "C"
+7 -4
Fájl megtekintése
@@ -4,7 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "comm.h"
#include "info.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -119,13 +120,13 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
}
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
// Tree up
struct ncclTree* tree = &args->channel->tree;
struct ncclTree* tree = &args->channel->treeUp;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
}
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
// Tree down
struct ncclTree* tree = &args->channel->tree;
struct ncclTree* tree = &args->channel->treeDn;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
}
@@ -157,7 +158,9 @@ void* persistentThread(void *comm_) {
}
} while (op == NULL);
op->idle = 0;
if (op->state != ncclProxyOpNone) ret = op->progress(op);
// Ops with opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that
// hasn't started yet and might be cancelled before they even start. Hold off on those.
if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
if (ret != ncclSuccess) {
comm->fatalError = ret;
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+77 -234
Fájl megtekintése
@@ -4,39 +4,9 @@
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "transport.h"
#include "nvmlwrap.h"
#include "comm.h"
#include "net.h"
#include "param.h"
#include "topo.h"
#include <cuda_runtime.h>
#include <assert.h>
#define NET_MAX_IFS 16
#define NET_MAX_GPUS 32
// Cache GPU-NIC distances to avoid re-computing them
#define NET_TVALUE_UNKNOWN 0ULL
static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
static int ncclNetNDev;
// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
#define NET_BITS_PER_IF 3
#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
// Packs one score per network device into a single ncclTvalue_t.
// Device d gets score 1 + PATH_SYS - distances[d], truncated to
// NET_BITS_PER_IF bits and stored at bit offset d*NET_BITS_PER_IF.
static ncclTvalue_t getTvalue(short* distances, int ndev) {
  ncclTvalue_t packed = 0;
  int shift = 0;
  for (int d = 0; d < ndev; d++, shift += NET_BITS_PER_IF) {
    ncclTvalue_t score = 1 + PATH_SYS - distances[d];
    // Keep 3 bits of score info per dev
    packed |= (score & NET_BITS_PER_IF_MASK) << shift;
  }
  return packed;
}
// Extract the NET_BITS_PER_IF-bit score stored for device `dev` in a packed tvalue.
static int getScore(ncclTvalue_t tvalue, int dev) {
  const int shift = dev*NET_BITS_PER_IF;
  ncclTvalue_t field = (tvalue >> shift) & NET_BITS_PER_IF_MASK;
  return (int)field;
}
#include "graph.h"
struct netConnectInfo {
ncclNetHandle_t netHandle;
@@ -53,6 +23,7 @@ struct netSendResources {
int buffSize;
void* mhandle;
void* llMhandle;
void* ll128Mhandle;
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
@@ -70,228 +41,61 @@ struct netRecvResources {
int buffSize;
void* mhandle;
void* llMhandle;
void* ll128Mhandle;
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
};
// Compute the distance between CUDA device cudaDev and network device dev and store it in *distance.
// NOTE(review): in this diff view the body of netDistance runs straight into the replacement
// netCanConnect definition and the two share the closing "return ncclSuccess; }" lines -
// confirm which lines belong to which revision before editing.
static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
char* cudaPath = NULL;
char* nicPath = NULL;
ncclResult_t err;
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
err = ncclNetPciPath(dev, &nicPath);
// Fall back to PATH_SYS whenever the NIC path lookup failed or either path is missing
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
// Both path strings are released here when non-NULL
if (nicPath) free(nicPath);
if (cudaPath) free(cudaPath);
/* Determine if two peers can communicate with NET */
// This form reports connectivity unconditionally: *ret is always set to 1 and none of the
// other parameters are read.
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 1;
return ncclSuccess;
}
// Enumerate the network devices and compute the distance from the current GPU to each of them.
//   ndev      : out, number of devices (clamped to NET_MAX_IFS)
//   distances : out, malloc'ed array of *ndev distances; the caller owns it and must free() it.
//               It is set to NULL whenever the function does not return ncclSuccess.
// Returns ncclSystemError when the network reports no device or the allocation fails, and
// propagates any error from the CUDA/NVML/distance helpers.
static ncclResult_t netDevices(int* ndev, short** distances) {
  *distances = NULL;
  NCCLCHECK(ncclNetDevices(ndev));
  if (*ndev == 0) {
    WARN("Error : Network returned 0 device");
    return ncclSystemError;
  }
  if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;

  // Identify the current GPU before allocating anything, so that a failure
  // in these calls cannot leak the distance array.
  int cudaDev, nvmlDev;
  CUDACHECK(cudaGetDevice(&cudaDev));
  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev));

  short* dists = (short*)malloc(*ndev*sizeof(short));
  if (dists == NULL) return ncclSystemError;

  // Build the log line with bounded writes only
  char line[1024];
  snprintf(line, sizeof(line), "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
  for (int d=0; d<*ndev; d++) {
    ncclResult_t res = netDistance(cudaDev, d, dists+d);
    if (res != ncclSuccess) {
      // Release the partially filled array instead of leaking it on the error path
      free(dists);
      return res;
    }
    size_t used = strlen(line);
    snprintf(line+used, sizeof(line)-used, " %s", pathDists[dists[d]]);
  }
  INFO(NCCL_INIT|NCCL_NET, "%s", line);

  // Hand ownership to the caller only once everything succeeded
  *distances = dists;
  return ncclSuccess;
}
/* Determine if we can communicate with the peer.
 * Writes into ret[0] the packed per-NIC score value of the current CUDA device, computing and
 * caching it in ncclNetTvalues on first use. myInfo and peerInfo are not read.
 * Returns ncclInternalError when the current CUDA device index does not fit in the cache. */
ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
  int cudaDev;
  CUDACHECK(cudaGetDevice(&cudaDev));
  // Validate the index before it is used on the cache; checking only after the
  // lookup would read past the end of ncclNetTvalues for large device indices.
  if (cudaDev >= NET_MAX_GPUS) {
    WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
    return ncclInternalError;
  }
  ret[0] = ncclNetTvalues[cudaDev];
  if (ret[0] == NET_TVALUE_UNKNOWN) {
    // First query for this device: measure distances and remember the result
    int nDev;
    short* distances;
    NCCLCHECK(netDevices(&nDev, &distances));
    ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
    ncclNetNDev = nDev;
    free(distances);
  }
  return ncclSuccess;
}
static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
int bestRank = -1;
int bestScore = 0;
for (int rank=0; rank<nranks; rank++) {
if (groups[rank] != group) continue;
for (int i=0; i<nranks; i++) {
ncclTvalue_t netValue = values[rank*nranks+i];
if (netValue != 0) {
ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
if (score >= minScore && score > bestScore) {
bestScore = score;
bestRank = rank;
}
// All other values should be the same, stop here for this rank
break;
}
}
}
return bestRank;
}
static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
// For the last rank, we don't need the absolute best score, just to be within minScore.
for (int rank=nranks-1; rank>=0; rank--) {
if (groups[rank] != group) continue;
if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;
if (startRank == rank) continue;
for (int i=0; i<nranks; i++) {
ncclTvalue_t netValue = values[rank*nranks+i];
if (netValue != 0) {
ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
if (score >= minScore) {
return rank;
}
// All other values should be the same, stop here for this rank
break;
}
}
}
return -1;
}
// Choose, for every ring and every group of ranks, the rank that receives from the network
// (start) and the rank that sends to it (end), then link consecutive groups through prev/next.
//   nranks     : number of ranks
//   groups     : group index of each rank; the group count is taken from groups[nranks-1]+1
//   subgroups  : sub-group index of each rank
//   values     : nranks x nranks table of packed per-NIC scores
//   nringsRet  : in, number of rings wanted; out, number of rings that could actually be built
//   prev, next : per-ring arrays (nranks entries per ring) updated with the inter-group links
//   minScore   : minimum NIC score a rank needs to be used as an entry/exit point
//   nthreads   : not used by this function
// Each NIC is used at most once per group across all rings.
ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
  int nGroups = groups[nranks-1] + 1;
  int *cardUsed, *starts, *ends;
  NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
  NCCLCHECK(ncclCalloc(&starts, nGroups));
  NCCLCHECK(ncclCalloc(&ends, nGroups));

  for (int ring = 0; ring<*nringsRet; ring++) {
    for (int group = 0; group<nGroups; group++) {
      int nranksInGroup = 0;
      int nsubGroups = 0;
      for (int rank=0; rank<nranks; rank++) {
        if (groups[rank] == group) {
          nranksInGroup++;
          nsubGroups = std::max(subgroups[rank], nsubGroups);
        }
      }
      starts[group] = ends[group] = -1;
      // Receive on the rank closest to the NIC
      for (int card=0; card<NET_MAX_IFS; card++) {
        if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
        int start = groupBestStart(nranks, groups, group, values, card, minScore);
        // No rank of this group qualifies on this card: try the next card. Bailing out here
        // also keeps subgroups[start] below from being evaluated with start == -1.
        if (start == -1) continue;
        // Send from any rank, but best on a different subgroup and close to the NIC also.
        int end = (nranksInGroup == 1) ? start
          : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
        if (end != -1) {
          cardUsed[group*NET_MAX_IFS+card] = 1;
          starts[group] = start;
          ends[group] = end;
          break;
        }
      }
      if (starts[group] == -1 || ends[group] == -1) {
        // This group cannot feed another ring: report how many complete rings exist and stop
        *nringsRet = ring;
        goto done;
      }
    }
    // Link groups together
    for (int group = 0; group<nGroups; group++) {
      int nextGroup = (group+1)%nGroups;
      next[ring*nranks+ends[group]] = starts[nextGroup];
      prev[ring*nranks+starts[nextGroup]] = ends[group];
    }
  }
done:
  free(cardUsed);
  free(starts);
  free(ends);
  return ncclSuccess;
}
int getDev(int cudaDev, int ringId) {
ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
int dev = 0;
int maxScore = 0;
for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
int skip = ringId+1;
while (skip) {
for (int d=0; d<ncclNetNDev; d++) {
if (getScore(tvalues, d) == maxScore) {
skip--;
if (skip == 0) { dev = d; goto end; }
}
}
}
end:
return dev;
}
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
// Decide whether GPU Direct RDMA should be used between a GPU and a network device.
// The decision is written to *useGdr: 0 (disabled) by default, 1 only when every check below passes.
// `read` is non-zero for the send side, which has extra conditions.
// NOTE(review): this span carries both the pre-change and the post-change lines of the diff
// (two signatures, two distance lookups, paired INFO and ncclNetPtrSupport calls) - confirm which
// revision applies before editing.
static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
// Start from "disabled"; every early return before the final assignment leaves it at 0
*useGdr = 0;
int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = ncclParamNetGdrRead();
// An explicit 0 turns GDR reads off
if (gdrReadParam == 0) return ncclSuccess;
// A negative value (the declared default is -2) means: enable only when NVLink is present
if (gdrReadParam < 0) {
int nvlink;
NCCLCHECK(ncclNvlinkGpu(&nvlink));
NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
if (!nvlink) return ncclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = ncclParamNetGdrLevel();
short distance;
NCCLCHECK(netDistance(cudaDev, dev, &distance));
int distance;
NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
// GDR stays disabled when the GPU/NIC distance is at or beyond the configured level
if (distance >= netGdrLevel) {
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
return ncclSuccess;
}
// Finally, check if the NIC supports it
int flags;
NCCLCHECK(ncclNetPtrSupport(dev, &flags));
NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
// The NIC must advertise NCCL_PTR_CUDA among its supported pointer types
if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
*useGdr = 1;
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
return ncclSuccess;
}
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct netSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
resources->netDev = getDev(cudaDev, channelId);
NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -303,20 +107,18 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
resources->buffSize = buffSize;
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
struct netRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
resources->netDev = getDev(cudaDev, channelId);
NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -328,7 +130,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
resources->buffSize = buffSize;
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
@@ -343,6 +145,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
send->conn.buff = recvMem->buff;
send->conn.llBuff = resources->devHostRecvMem->llBuff;
send->conn.ll128Buff = recvMem->ll128Buff;
// Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->devHostRecvMem->tail;
@@ -360,6 +163,8 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
return ncclSuccess;
}
@@ -373,6 +178,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
recv->conn.buff = recvMem->buff;
recv->conn.llBuff = recvMem->llBuff;
recv->conn.ll128Buff = recvMem->ll128Buff;
// Head/Tail/Opcount are always on host
recv->conn.tail = &resources->devHostRecvMem->tail;
@@ -388,6 +194,8 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
return ncclSuccess;
}
@@ -397,6 +205,7 @@ ncclResult_t netSendFree(void* transportResources) {
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
CUDACHECK(cudaFree(resources->devRecvMem));
@@ -410,6 +219,7 @@ ncclResult_t netRecvFree(void* transportResources) {
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
CUDACHECK(cudaFree(resources->devRecvMem));
@@ -437,7 +247,39 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
if (args->llMode) {
if (args->protocol == NCCL_PROTO_LL128) {
int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
if (args->tail < *recvTail) {
int buffSlot = args->tail%NCCL_STEPS;
if (sizesFifo[buffSlot] != -1) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
char* localBuff = (char*)localMem->ll128Buff;
int ready = resources->useGdr;
if (!ready) {
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
// called threadfence()
uint64_t flag = args->tail + 1;
int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
ready = 1;
for (int i=0; i<nFifoLines; i++) {
if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
}
}
if (ready) {
// Send through network
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
// Make sure size is reset to -1 before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
}
}
}
}
} else if (args->protocol == NCCL_PROTO_LL) {
int buffSlot = args->tail%NCCL_STEPS;
int size = sizesFifo[buffSlot];
if (size != -1) {
@@ -463,17 +305,19 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
} else if (args->tail < *recvTail) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
int stepSize = args->channel->buffSize/NCCL_STEPS;
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
// Send through network
int buffSlot = args->tail%NCCL_STEPS;
NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
// Make sure size is reset to -1 before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
if (sizesFifo[buffSlot] != -1) {
NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
// Make sure size is reset to -1 before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
}
}
}
}
@@ -512,11 +356,11 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
}
if (args->state == ncclProxyOpProgress) {
args->idle = 1;
int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
if (args->head < args->end) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
volatile uint64_t* sendHead = &resources->hostSendMem->head;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
@@ -533,7 +377,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
if (done) {
args->head += args->sliceSteps;
if (args->llMode == 0) {
if (args->protocol == NCCL_PROTO_SIMPLE) {
if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
resources->hostRecvMem->tail = args->head;
}
@@ -553,7 +397,6 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
// Descriptor of the "NET" transport: its name, the connectivity check, and one set of
// callbacks (setup, connect, free, proxy) for each of the send and receive directions.
// NOTE(review): netGetRings is listed here next to the entries of the post-change layout;
// it is presumably the removed side of the diff - confirm before relying on the field order.
struct ncclTransport netTransport = {
"NET",
netCanConnect,
netGetRings,
{ netSendSetup, netSendConnect, netSendFree, netSendProxy },
{ netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
};
@@ -8,7 +8,7 @@
#include "core.h"
#include "socket.h"
#include "net.h"
#include "topo.h"
#include "graph.h"
#include "utils.h"
#include "param.h"
@@ -107,7 +107,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
char* userIbEnv = getenv("NCCL_IB_HCA");
struct netIf userIfs[MAX_IB_DEVS];
bool searchNot = userIbEnv && userIbEnv[0] == '^';
if (searchNot) userIbEnv++;
bool searchExact = userIbEnv && userIbEnv[0] == '=';
if (searchExact) userIbEnv++;
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
@@ -199,32 +201,14 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
}
if (moduleLoaded == 0) return ncclSystemError;
ncclResult_t ret = ncclSystemError;
void* ptr;
if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
struct ibv_mr* mr;
struct ibv_pd* pd;
if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) {
ret = ncclSuccess;
wrap_ibv_dereg_mr(mr);
}
wrap_ibv_dealloc_pd(pd);
}
cudaFree(ptr);
}
return ret;
return ncclSuccess;
}
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
*supportedTypes = NCCL_PTR_HOST;
int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (ncclIbGdrSupport(dev) != ncclSuccess) {
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
return ncclSuccess;
}
*supportedTypes |= NCCL_PTR_CUDA;
@@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "comm.h"
#include "core.h"
#include "socket.h"
#include "net.h"
@@ -108,6 +108,7 @@ struct ncclSocketRequest {
void* data;
int size;
int ctrlFd;
int offset;
int used;
struct ncclSocketComm* comm;
struct ncclSocketTask* tasks[MAX_SOCKETS];
@@ -193,7 +194,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
}
if (nThreads == -2 || nSocksPerThread == -2) {
// Auto-detection
int autoNt=1, autoNs=1;
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
char vendorPath[PATH_MAX];
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
char* rPath = realpath(vendorPath, NULL);
@@ -213,6 +214,9 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
if (strcmp(vendor, "0x1d0f") == 0) { // AWS
autoNt = 2;
autoNs = 8;
} else if (strcmp(vendor, "0x1ae0") == 0) { // GCP
autoNt = 4;
autoNs = 1;
}
end:
if (nThreads == -2) nThreads = autoNt;
@@ -226,7 +230,7 @@ end:
}
*ns = nSocks;
*nt = nThreads;
INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
return ncclSuccess;
}
@@ -379,31 +383,45 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
return ncclInternalError;
}
r->size = data;
r->offset = 0;
r->used = 2; // done exchanging size
// divide into subtasks
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
int chunkOffset = 0, i = 0;
while (chunkOffset < r->size) {
int chunkSize = std::min(taskSize, r->size-chunkOffset);
NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
chunkOffset += chunkSize;
if (r->comm->nSocks > 0) {
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
while (chunkOffset < r->size) {
int chunkSize = std::min(taskSize, r->size-chunkOffset);
NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
chunkOffset += chunkSize;
}
}
r->nSubs = i;
}
if (r->used == 2) { // already exchanged size
int nCompleted = 0;
for (int i=0; i<r->nSubs; i++) {
struct ncclSocketTask* sub = r->tasks[i];
if (sub->result != ncclSuccess) return sub->result;
if (sub->offset == sub->size) nCompleted++;
}
if (nCompleted == r->nSubs) {
if (size) *size = r->size;
*done = 1;
r->used = 0;
if (r->nSubs > 0) {
int nCompleted = 0;
for (int i=0; i<r->nSubs; i++) {
struct ncclSocketTask* sub = r->tasks[i];
sub->used = 0;
if (sub->result != ncclSuccess) return sub->result;
if (sub->offset == sub->size) nCompleted++;
}
if (nCompleted == r->nSubs) {
if (size) *size = r->size;
*done = 1;
r->used = 0;
for (int i=0; i<r->nSubs; i++) {
struct ncclSocketTask* sub = r->tasks[i];
sub->used = 0;
}
}
} else { // progress request using main thread
if (r->offset < r->size) {
NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->data, r->size, &r->offset));
}
if (r->offset == r->size) {
if (size) *size = r->size;
*done = 1;
r->used = 0;
}
}
}
+60 -392
Fájl megtekintése
@@ -4,15 +4,9 @@
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "comm.h"
#include "graph.h"
#include "utils.h"
#include "topo.h"
#include "transport.h"
#include "param.h"
#include <unistd.h>
#include <cuda_runtime.h>
#include <ctype.h>
#include "nvlink.h"
struct p2pConnectInfo {
int direct;
@@ -38,419 +32,91 @@ NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
/* Convert a PCI busId into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES).
 * Returns the index of the visible CUDA device whose PCI bus id matches, or -1 when the device
 * count or a bus id cannot be queried, or when no visible device matches.
 * NOTE(review): this span holds both forms of the function from the diff - the one comparing
 * bus id strings with strcmp and the one comparing int64_t bus ids - confirm which applies. */
static int busIdToCudaDev(const char* busId) {
static int busIdToCudaDev(int64_t busId) {
int ndev;
if (cudaGetDeviceCount(&ndev) != cudaSuccess)
return -1;
// Compare the requested bus id against every locally visible device
for (int i = 0; i < ndev; i++) {
char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
return -1;
if (strcmp(busId, devBusId) == 0) {
return i;
}
int64_t devBusId;
// NOTE(review): NCCLCHECK presumably returns an ncclResult_t on failure; in this int-returning
// function that value would be handed back as if it were a device index - verify.
NCCLCHECK(busIdToInt64(devBusIdStr, &devBusId));
if (busId == devBusId) return i;
}
// BusId was not found in our locally visible CUDA devices
return -1;
}
// Overview: *ret is cleared to 0 first and only set to a non-zero value when P2P is judged usable.
// The checks run in this order: P2P disabled by parameter, peers on different hosts, device
// visibility in this process, same-device shortcut, CUDA peer-access query, NVLink presence and
// finally the distance compared with p2pLevel.
// NOTE(review): this span interleaves the pre-change and post-change lines of the diff (two
// signatures, myInfo/peerInfo next to info1/info2, score-valued results next to plain 1) -
// confirm which revision applies before editing.
/* Determine if we can communicate with the peer through p2p */
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
// Do not use P2P across root complexes by default (provided CUDA permits it)
int p2pLevel = PATH_NODE;
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
int cpuCount;
NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
// Do not use P2P across sockets by default (provided CUDA permits it).
// When we are on a single socket, don't even use P2P through the CPU as
// it should be able to sustain two flows to sysmem faster than PCI P2P.
int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
// P2P_DISABLE == 1 forces level 0; a P2P_LEVEL other than -2 then overrides whatever was chosen
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
// Disable P2P
*ret = 0;
if (p2pLevel == 0) return ncclSuccess;
// Rule out different nodes
if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
if (info1->hostHash != info2->hostHash) return ncclSuccess;
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
if (peerCudaDev == -1) {
int cudaDev1 = busIdToCudaDev(info1->busId);
int cudaDev2 = busIdToCudaDev(info2->busId);
if (cudaDev1 == -1 || cudaDev2 == -1) {
// Peer's CUDA device is not visible in this process
#if CUDART_VERSION >= 10010
// But in CUDA 10.1 we can still communicate with 'invisible' devices
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
// Check for NVLink/NVswitch including P2P access
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
if (nvlinkp2p > 0) {
*ret = nvlinkp2p;
int nvlink;
NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
if (nvlink > 0) {
*ret = 1;
return ncclSuccess;
}
#endif
// Without NVLink (or before CUDA 10.1) an invisible device means no P2P: *ret stays 0
return ncclSuccess;
}
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);
// Do not detect topology if we're on the same GPU. Note this is not really supported.
if (myInfo->cudaDev == peerCudaDev) {
*ret = 1 + PATH_SYS;
if (cudaDev1 == cudaDev2) {
*ret = 1;
return ncclSuccess;
}
// See if CUDA can do P2P
int p2p;
if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
cudaDev1, info1->busId, cudaDev2, info2->busId);
// A failed query is not an error for the caller: P2P is simply reported as unavailable
return ncclSuccess;
}
if (p2p == 0) return ncclSuccess;
// Check for NVLink/NVswitch
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
if (nvlinkp2p > 0) {
*ret = nvlinkp2p;
int nvlink;
NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
if (nvlink > 0) {
*ret = 1;
return ncclSuccess;
}
// Finally compute the PCI distance and compare with the p2pLevel.
char* myPath;
char* peerPath;
ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
if (err1 == ncclSuccess && err2 == ncclSuccess) {
int distance = pciDistance(myPath, peerPath);
if (distance < p2pLevel) {
*ret = 1 + PATH_SYS - distance;
}
int distance;
NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
// P2P is accepted only when the two GPUs are strictly closer than p2pLevel
if (distance < p2pLevel) {
*ret = 1;
}
if (err1 == ncclSuccess) free(myPath);
if (err2 == ncclSuccess) free(peerPath);
return ncclSuccess;
}
#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
#define MAXGPUS_PCI 64
// Recursive backtracking search that tries to build up to nRingsMax rings over n ranks.
//   matrix      : n x n table; matrix[a*n+b] is treated as the number of times the hop a->b may
//                 still be used (it is decremented while a hop is in use and restored afterwards)
//   rings       : output, ring k occupies rings[k*n] .. rings[k*n+n-1]
//   currentRing : index of the ring being built
//   inTheRing   : per-rank flag, 1 when the rank is already part of the ring being built
//   current     : rank being appended by this call
//   remaining   : number of ranks still to append after `current`
//   connect     : when non-zero, the first two slots of every ring are taken as pre-set
// Returns the number of rings that could be closed from currentRing onwards (0 when none).
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
int nrings = 0;
// Row of the matrix holding the hops leaving `current`
ncclTvalue_t* line = matrix+current*n;
inTheRing[current] = 1;
// currentStep is the slot following the one `current` is written to
int currentStep = (currentRing+1)*n-remaining;
rings[currentStep-1] = current;
if (remaining == 0) {
// The ring is full: it is valid only if `current` can hop back to the ring's first rank
int looprank = rings[currentRing*n];
if (line[looprank] > 0) {
if (currentRing+1 == nRingsMax) {
nrings = 1;
} else {
// Consume the closing hop and start the next ring with a clean membership table
line[looprank]--;
for (int i=0; i<n; i++) inTheRing[i] = 0;
if (connect) {
// First two slots are already set and we need to respect those constraints
inTheRing[rings[currentStep]] = 1;
nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect);
} else {
rings[(currentRing+1)*n] = 0;
nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect);
}
// Undo the changes made for the next ring so the caller can keep exploring
line[looprank]++;
for (int i=0; i<n; i++) inTheRing[i] = 1;
}
}
} else {
// Scratch copy of the best continuation found so far.
// NOTE(review): fixed size; indices go up to (nr+currentRing)*n-1, so this presumably relies
// on n <= MAXGPUS_NVLINKP2P being enforced by the caller - confirm.
int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
int maxStep = 0;
// Try every rank not yet in the ring that is still reachable from `current`
for (int i=0; i<n; i++) {
if (inTheRing[i] == 0 && line[i] > 0) {
line[i]--;
int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect);
if (nr > nrings) {
nrings = nr;
maxStep = (nr+currentRing)*n;
ringsSave[currentStep] = i;
// Save the rest of the rings
for (int r=currentStep+1; r<maxStep; r++) {
ringsSave[r] = rings[r];
}
if (nrings + currentRing == nRingsMax) {
// We found an optimal solution. Let's stop there.
// Note that leaving the loop here skips the line[i]++ restore below.
break;
}
}
line[i]++;
}
}
// Write the best continuation back into rings (no-op when nothing was found, maxStep == 0)
for (int r=currentStep; r<maxStep; r++) {
rings[r] = ringsSave[r];
}
}
inTheRing[current] = 0;
return nrings;
}
// Grow a set of `nrings` rings to `newNrings` rings (capped at MAXCHANNELS) by repeating the
// existing ones in order: ring r becomes a copy of ring (r % nrings).
// Each ring occupies nranks consecutive entries of `rings`.
// Returns the resulting ring count, or 0 when there is nothing to duplicate.
static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
  if (nrings == 0) return 0;
  const int total = newNrings > MAXCHANNELS ? MAXCHANNELS : newNrings;
  for (int dst = nrings; dst < total; dst++) {
    const int* from = rings + (dst%nrings)*nranks;
    int* to = rings + dst*nranks;
    for (int k = 0; k < nranks; k++) to[k] = from[k];
  }
  return total;
}
// Entry point of the recursive ring search: allocates the per-rank membership table, seeds the
// first ring and hands over to computeRingsRec.
// With `connect` set, rings[0] and rings[1] are taken as already chosen and the search continues
// from rings[1]; otherwise the first ring starts at rank 0.
// Returns the number of rings found, or 0 when the membership table cannot be allocated.
int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) {
  int* visited = (int*)malloc(sizeof(int)*nranks);
  if (visited == NULL) {
    WARN("malloc of %ld bytes failed", sizeof(int)*nranks);
    return 0;
  }
  int r = nranks;
  while (r-- > 0) visited[r] = 0;

  // Rank the search starts from and number of ranks still to place after it
  int first, left;
  if (connect) {
    visited[rings[0]] = 1;
    first = rings[1];
    left = nranks-2;
  } else {
    rings[0] = 0;
    first = 0;
    left = nranks-1;
  }
  int found = computeRingsRec(matrix, nranks, rings, 0, nringsMax, visited, first, left, connect);
  free(visited);
  return found;
}
// Scan ranks[0..nranks) and return the index of the first entry that is not -1,
// or -1 when every entry is -1 (or nranks is not positive).
static inline int findConnect(int nranks, int* ranks) {
  int idx = 0;
  while (idx < nranks) {
    if (ranks[idx] != -1) {
      return idx;
    }
    idx++;
  }
  return -1;
}
// Build rings for the point-to-point NVLink case.
//   values        : nranks x nranks connectivity values; each entry is divided by
//                   CONNECT_NVLINK (and doubled when oversubscribing) to form the search matrix.
//   nranks        : number of ranks per ring.
//   rings         : output ring table, nranks entries per ring.
//   nrings        : requested ring count; capped at MAXCHANNELS.
//   prev, next    : per-ring tables (nranks entries each) where entries other than -1
//                   mark already-fixed connections.
//   oversubscribe : non-zero to double every link count before searching.
//   nthreads      : only dereferenced to seed the value passed to the oversubscribed retry.
// Returns the number of rings produced, or 0 on allocation failure / nrings == 0.
int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
  if (nrings == 0) return 0;
  if (nrings > MAXCHANNELS) {
    WARN("Max rings reached, limiting to %d", MAXCHANNELS);
    nrings = MAXCHANNELS;
  }
  // Find existing constraints / connections: a ring with both a fixed prev and a
  // fixed next gets its first two slots pre-set to (end, start).
  int connect = 0;
  for (int r=0; r<nrings; r++) {
    int start = findConnect(nranks, prev+r*nranks);
    int end = findConnect(nranks, next+r*nranks);
    if (start != -1 && end != -1) {
      rings[r*nranks] = end;
      rings[r*nranks+1] = start;
      connect = 1;
    }
  }

  // Compute rings
  const size_t matrixBytes = sizeof(ncclTvalue_t)*nranks*nranks;
  ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(matrixBytes);
  if (matrix == NULL) {
    // size_t is printed with %zu; %ld was a mismatched specifier.
    WARN("malloc of %zu bytes failed", matrixBytes);
    return 0;
  }
  for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++)
    matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ;

  int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect);

  free(matrix);

  // Oversubscribed or constrained searches return their result as-is (no duplication).
  if (oversubscribe || connect) return compNrings;

  if (compNrings && compNrings < nrings && nranks <= 4) {
    // Try to oversubscribe to get a better result
    const size_t rings2Bytes = sizeof(int)*MAXCHANNELS*nranks;
    int *rings2 = (int *)malloc(rings2Bytes);
    if (rings2 == NULL) {
      WARN("malloc of %zu bytes failed", rings2Bytes);
      return 0;
    }
    for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
    int nThreads = *nthreads;
    int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
    // Keep the oversubscribed result only if it beats duplicating the plain result.
    if (compNrings2 > compNrings*2) {
      // Oversubscription worked.
      for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i];
      compNrings = compNrings2;
    }
    free(rings2);
  }

  // Duplicate the rings for direct NVLink
  compNrings = copyRings(nranks, rings, compNrings, compNrings*2);

  return compNrings;
}
// Complete rings whose two end points are already fixed in prev/next by visiting the
// remaining ranks in increasing order (modulo nranks).
//   values      : nranks x nranks scores; every hop cur -> candidate must be >= minScore.
//   rings       : output ring table, nranks entries per ring; slot 0 gets the fixed
//                 "end" rank and slot 1 the fixed "start" rank.
//   nringsStart : number of rings to process.
//   prev, next  : per-ring tables where an entry other than -1 marks a fixed connection.
//   nthreads    : not used by this function.
// Returns nringsStart on success, 0 if a hop scores below minScore or the first ring
// has no fixed end points, or r when ring r (> 0) lacks them after earlier rings connected.
int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
  int connected = 0;
  for (int r = 0; r < nringsStart; r++) {
    const int base = r*nranks;
    const int start = findConnect(nranks, prev+base);
    const int end = findConnect(nranks, next+base);

    if (start == -1 || end == -1) {
      // No usable constraint for this ring.
      if (connected == 1 && r > 0) {
        WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", r);
        return r;
      }
      return 0;
    }

    rings[base] = end;
    rings[base+1] = start;

    int cur = start;
    for (int slot = 2; slot < nranks; slot++) {
      // Next rank in sequence, skipping the two ranks that are already placed.
      int candidate = (cur+1) % nranks;
      while (candidate == end || candidate == start) {
        candidate = (candidate+1) % nranks;
      }
      if (values[cur*nranks+candidate] < minScore) {
        return 0;
      }
      rings[base+slot] = candidate;
      cur = candidate;
    }
    connected = 1;
  }
  return nringsStart;
}
// Fill nringsStart rings with the identity order 0, 1, ..., nranks-1.
// Only nranks, rings and nringsStart are used; the other parameters are ignored.
// Returns nringsStart.
int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
  for (int ring = 0; ring < nringsStart; ring++) {
    int slot = ring*nranks;
    for (int rank = 0; rank < nranks; rank++, slot++) {
      rings[slot] = rank;
    }
  }
  return nringsStart;
}
// Pick the next rank to append after `rank`.
// Scores are tried from PATH_SYS+1 down to minScore. At each score level, only ranks
// not yet in the ring whose values[rank][n] equals that score are considered:
//   - if end == -1, the first such rank is returned immediately;
//   - otherwise the one with the smallest values[end][n] is chosen (first wins on ties),
//     i.e. closest to rank, farthest from end.
// Returns the chosen rank, or -1 if no candidate exists at any score level.
static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
  int score = PATH_SYS+1;
  while (score >= minScore) {
    int pick = -1;
    int lowestEndScore = PATH_SYS+2;  // above any score tried here, so the first candidate always wins
    for (int peer = 0; peer < nranks; peer++) {
      if (inRing[peer] || values[rank*nranks+peer] != score) continue;
      if (end == -1) return peer;
      const ncclTvalue_t endScore = values[end*nranks+peer];
      if (endScore < lowestEndScore) {
        pick = peer;
        lowestEndScore = endScore;
      }
    }
    if (pick != -1) return pick;
    score--;
  }
  return -1;
}
// Build rings greedily for the PCIe/QPI case using findClosestPci.
//   values     : nranks x nranks scores.
//   rings      : output ring table, nranks entries per ring (slot 0 = end, slot 1 = start).
//   nrings     : number of rings to attempt.
//   prev, next : per-ring tables where an entry other than -1 marks a fixed connection.
//   minScore   : lowest score findClosestPci is allowed to use.
// Returns the number of usable rings: r when ring r cannot be built, 1 as soon as an
// unconstrained ring is completed, otherwise nrings.
// NOTE(review): inRing holds MAXGPUS_PCI entries and nranks is not checked against it
// here — confirm the caller guarantees nranks <= MAXGPUS_PCI.
int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) {
  int connect = 0;
  for (int r = 0; r < nrings; r++) {
    const int base = r*nranks;
    int start = findConnect(nranks, prev+base);
    int end = findConnect(nranks, next+base);

    int inRing[MAXGPUS_PCI];
    for (int i = 0; i < nranks; i++) inRing[i] = 0;

    const int haveStart = (start != -1);
    const int haveEnd = (end != -1);

    if (haveStart && haveEnd) {
      // Both end points are fixed: we are connecting existing rings.
      connect = 1;
    } else if (haveStart || haveEnd) {
      WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", r);
      return r;
    } else {
      if (connect == 1 && r > 0) {
        WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r);
        return r;
      }
      // Unconstrained ring: anchor it at rank 0 and pick its closest neighbour.
      end = 0;
      inRing[end] = 1;
      start = findClosestPci(values, inRing, end, -1, nranks, minScore);
      if (start == -1) return r;
    }

    rings[base] = end;
    rings[base+1] = start;
    inRing[start] = inRing[end] = 1;

    int cur = start;
    for (int slot = 2; slot < nranks; slot++) {
      const int chosen = findClosestPci(values, inRing, cur, end, nranks, minScore);
      if (chosen == -1) return r;
      inRing[chosen] = 1;
      rings[base+slot] = chosen;
      cur = chosen;
    }

    // Check the loop is closing: with only `end` left available, it must be the pick.
    inRing[end] = 0;
    if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r;

    if (connect == 0) return 1;
  }
  return nrings;
}
// Compute P2P rings and record them in prev/next.
//   nranks            : number of ranks.
//   groups, subgroups : not used by this function.
//   values            : nranks x nranks connectivity values.
//   nringsRet         : in: requested ring count (0 means nothing to do); out: rings produced.
//   prev, next        : per-ring tables (nranks entries each); only entries still equal
//                       to -1 are filled from the computed rings.
//   minScore          : minimum score forwarded to the ring builders.
//   nthreads          : forwarded to the ring builders.
// Three strategies are tried in order: NVSwitch, point-to-point NVLink, PCIe/QPI.
// Returns ncclSuccess, or ncclInternalError on inconsistent NVSwitch link counts or
// when the NVLink path is asked to handle more than MAXGPUS_NVLINKP2P ranks.
// The temporary ring table is released on every exit path after its allocation.
ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
  if (*nringsRet == 0) return ncclSuccess;
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
  for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
  int nrings = *nringsRet;

  // NVswitch
  int nvswitchLinks = 0;
  int directLinks = 0;
  for (int rank=0; rank<nranks; rank++) {
    for (int j=1; j<nranks; j++) {
      int i = (rank + j) % nranks;
      ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH;
      if (j>1 && links != nvswitchLinks) {
        WARN("Internal error : NVswitch links mismatch");
        free(rings);  // do not leak the ring table on the error path
        return ncclInternalError;
      }
      nvswitchLinks = links;
    }
  }
  if (nvswitchLinks) {
    // NVSwitch : Connect existing rings
    int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads);
    if (nringsConnected > 0) {
      nrings = nringsConnected;
    } else {
      nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks
      // Or create new ones
      nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads);
      // And duplicate them
      nrings = copyRings(nranks, rings, nrings, nrings*2);
    }
    goto end;
  }

  // point-to-point NVLink: directLinks is the minimum, over all ranks, of the summed
  // NVLink counts of entries below CONNECT_NVSWITCH.
  for (int rank=0; rank<nranks; rank++) {
    int links = 0;
    for (int i=0; i<nranks; i++) {
      ncclTvalue_t val = values[rank*nranks+i];
      if (val >= CONNECT_NVSWITCH) continue;
      links += val/CONNECT_NVLINK;
    }
    if (rank == 0) directLinks = links;
    else directLinks = std::min(directLinks, links);
  }
  if (directLinks > 0) {
    // NVLink : Connect rings or create new ones
    if (nranks > MAXGPUS_NVLINKP2P) {
      WARN("Recursive P2P computation cannot work for >8 GPUs");
      free(rings);  // do not leak the ring table on the error path
      return ncclInternalError;
    }
    nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
    goto end;
  }

  // PCIe or QPI : Connect rings or create new ones
  nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore);

end:
  *nringsRet = nrings;
  // Translate each ring's rank order into prev/next links, keeping any link that
  // was already set (i.e. is not -1).
  for (int ring = 0; ring<nrings; ring++) {
    for (int index=0; index<nranks; index++) {
      int prevIndex = (index - 1 + nranks) % nranks;
      int nextIndex = (index + 1) % nranks;
      int curRank = rings[ring*nranks+index];
      int prevRank = rings[ring*nranks+prevIndex];
      int nextRank = rings[ring*nranks+nextIndex];
      if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank;
      if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank;
    }
  }

  free(rings);
  return ncclSuccess;
}
@@ -462,7 +128,7 @@ end:
} while (0)
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct p2pSendResources* resources;
@@ -477,19 +143,20 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
info.direct = 1;
info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
return ncclInternalError;
} else {
// Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d(=%d): %d %s",
peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
WARN("failed to peer with device %d(=%lx): %d %s",
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
}
} else {
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -498,12 +165,12 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
// Map IPC and enable P2P access
cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -512,7 +179,7 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
}
/* Create and return connect structures for this peer to connect to me */
ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
struct p2pRecvResources* resources;
@@ -534,11 +201,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
WARN("failed to peer with device %d(=%d): %d %s",
peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
WARN("failed to peer with device %d(=%lx): %d %s",
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
}
} else {
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -547,11 +214,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
// Map IPC and enable P2P access
cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -580,6 +247,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
send->conn.buff = remDevMem->buff;
send->conn.llBuff = remDevMem->llBuff;
send->conn.ll128Buff = remDevMem->ll128Buff;
send->conn.tail = &remDevMem->tail;
send->conn.opCountRem = &remDevMem->opCount;
send->conn.head = &resources->devMem->head;
@@ -610,6 +278,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
recv->conn.buff = resources->devMem->buff;
recv->conn.llBuff = resources->devMem->llBuff;
recv->conn.ll128Buff = resources->devMem->ll128Buff;
recv->conn.tail = &resources->devMem->tail;
recv->conn.opCountLoc = &resources->devMem->opCount;
recv->conn.head = &remDevMem->head;
@@ -638,7 +307,6 @@ ncclResult_t p2pRecvFree(void* resources) {
// Transport descriptor for "P2P": a name string, the connectivity check, the ring
// computation routine, then one callback group for send and one for recv.
// NOTE(review): field meanings follow struct ncclTransport, which is declared
// elsewhere — confirm the order against that declaration.
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
p2pGetRings,
// send side: setup, connect, free; the fourth slot is left NULL
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
// recv side: setup, connect, free; the fourth slot is left NULL
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
};
+18 -91
Fájl megtekintése
@@ -4,13 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "utils.h"
#include "transport.h"
#include "param.h"
#include "comm.h"
#include "shm.h"
#include <unistd.h>
#include <cuda_runtime.h>
struct shmConnectInfo {
uint64_t pidHash;
@@ -40,98 +35,29 @@ struct shmRecvResources {
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
/* Determine if we can communicate with the peer */
// Writes 1 to *ret when SHM is not disabled and both peers report the same hostHash,
// 0 otherwise. Always returns ncclSuccess.
ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
  if (ncclParamShmDisable() == 1 || myInfo->hostHash != peerInfo->hostHash) {
    *ret = 0;
  } else {
    *ret = 1;
  }
  return ncclSuccess;
}
/* Determine whether two peers can communicate with SHM */
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 0;
// Return the lowest rank in [0, nranks) that belongs to `group` and is not
// rankToAvoid, or -1 if there is none.
static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) {
  int rank = 0;
  while (rank < nranks) {
    if (rank != rankToAvoid && groups[rank] == group) {
      return rank;
    }
    rank++;
  }
  return -1;
}
if (ncclParamShmDisable() == 1) return ncclSuccess;
// Return the highest rank in [0, nranks) that belongs to `group` and is not
// rankToAvoid, or -1 if there is none.
static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) {
  int rank = nranks - 1;
  while (rank >= 0) {
    if (rank != rankToAvoid && groups[rank] == group) {
      return rank;
    }
    rank--;
  }
  return -1;
}
// Same host?
TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
if (info1->hostHash != info2->hostHash) return ncclSuccess;
#define MAXGROUPS 16
// Common /dev/shm (between containers) ?
TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev);
if (info1->shmDev != info2->shmDev) return ncclSuccess;
*ret = 1;
// Link groups of ranks into rings by filling prev/next across group boundaries.
// For every ring, one "start" rank and one "end" rank is selected per group, then
// each group's end is connected to the following group's start.
//   nranks     : number of ranks.
//   groups     : group id of each rank.
//   nringsRet  : in/out ring count; a request for MAXCHANNELS is reduced to 1, and the
//                count is truncated to the current ring index if a group has no usable
//                start or end.
//   prev, next : per-ring tables (nranks entries each); -1 means "not set".
// subgroups, values, minScore and nthreads are not used in this function.
// Always returns ncclSuccess.
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
// The group count is taken from the last rank's group id.
// NOTE(review): presumably group ids are non-decreasing with rank — confirm with the caller.
int nGroups = groups[nranks-1] + 1;
// NOTE(review): nGroups is not checked against MAXGROUPS before indexing these arrays — confirm the bound.
int starts[MAXGROUPS];
int ends[MAXGROUPS];
for (int ring = 0; ring<*nringsRet; ring++) {
// Groups that already hold a rank with prev set / next set for this ring (last one seen wins).
int startGroup = -1, endGroup = -1;
for (int group = 0; group<nGroups; group++) {
int start = -1;
int end = -1;
int nranksInGroup = 0;
for (int rank=0; rank<nranks; rank++) {
if (groups[rank] != group) continue;
nranksInGroup++;
// A rank whose prev is already set becomes the start of its group.
if (prev[ring*nranks+rank] != -1) {
if (start != -1) {
WARN("Multiple starts found in group");
}
start = rank;
startGroup = group;
}
// A rank whose next is already set becomes the end of its group.
if (next[ring*nranks+rank] != -1) {
if (end != -1) {
WARN("Multiple ends found in group");
}
end = rank;
endGroup = group;
}
}
if (nranksInGroup == 1) {
// Single-rank group: that rank is both start and end.
start = end = groupFirst(nranks, groups, group, -1);
} else {
// Fill in whichever side is missing, avoiding the rank used for the other side.
if (start == -1)
start = groupFirst(nranks, groups, group, end);
if (end == -1)
end = groupLast(nranks, groups, group, start);
}
if (start == -1 || end == -1) {
// Cannot build this ring: keep only the rings completed so far.
*nringsRet = ring;
return ncclSuccess;
}
starts[group] = start;
ends[group] = end;
}
if (endGroup == -1 || startGroup == -1) {
// No usable pre-existing connection: run from group 0 to the last group.
startGroup = 0;
endGroup = nGroups-1;
// Close the loop
next[ring*nranks+ends[endGroup]] = starts[startGroup];
prev[ring*nranks+starts[startGroup]] = ends[endGroup];
}
// Chain the intermediate groups in increasing order (modulo nGroups), skipping endGroup.
int group = startGroup;
for (int i=0; i<nGroups-2; i++) {
int nextGroup = (group+1)%nGroups;
if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups;
next[ring*nranks+ends[group]] = starts[nextGroup];
prev[ring*nranks+starts[nextGroup]] = ends[group];
group = nextGroup;
}
// Connect with the last
next[ring*nranks+ends[group]] = starts[endGroup];
prev[ring*nranks+starts[endGroup]] = ends[group];
}
return ncclSuccess;
}
#define MAX_SHM_NAME_LEN 1024
/* Create and return connect structures for this peer to connect to me */
ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct shmSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
@@ -149,13 +75,13 @@ ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
struct shmRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
@@ -194,6 +120,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
send->transportResources = resources;
send->conn.buff = resources->devRemHostMem->buff;
send->conn.llBuff = resources->devRemHostMem->llBuff;
send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
send->conn.tail = &resources->devRemHostMem->tail;
send->conn.opCountRem = &resources->devRemHostMem->opCount;
@@ -218,6 +145,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
recv->conn.buff = resources->devHostMem->buff;
recv->conn.llBuff = resources->devHostMem->llBuff;
recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
recv->conn.tail = &resources->devHostMem->tail;
recv->conn.opCountLoc = &resources->devHostMem->opCount;
return ncclSuccess;
@@ -242,7 +170,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
// Transport descriptor for "SHM": a name string, the connectivity check, the ring
// computation routine, then one callback group for send and one for recv.
// NOTE(review): field meanings follow struct ncclTransport, which is declared
// elsewhere — confirm the order against that declaration.
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
shmGetRings,
// send side: setup, connect, free; the fourth slot is left NULL
{ shmSendSetup, shmSendConnect, shmSendFree, NULL },
// recv side: setup, connect, free; the fourth slot is left NULL
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
};