Merge remote-tracking branch 'remotes/nccl/master' into rccl_2.5.6
Этот коммит содержится в:
+10
-4
@@ -104,26 +104,32 @@ endforeach(filename)
|
||||
|
||||
set(CC_SOURCES
|
||||
src/init.cc
|
||||
src/graph/trees.cc
|
||||
src/graph/rings.cc
|
||||
src/graph/paths.cc
|
||||
src/graph/search.cc
|
||||
src/graph/connect.cc
|
||||
src/graph/tuning.cc
|
||||
src/graph/topo.cc
|
||||
src/collectives/all_reduce.cc
|
||||
src/collectives/all_gather.cc
|
||||
src/collectives/reduce.cc
|
||||
src/collectives/broadcast.cc
|
||||
src/collectives/reduce_scatter.cc
|
||||
src/channel.cc
|
||||
src/misc/trees.cc
|
||||
src/misc/rings.cc
|
||||
src/misc/argcheck.cc
|
||||
src/misc/group.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/utils.cc
|
||||
src/misc/ibvwrap.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/topo.cc
|
||||
src/transport/net.cc
|
||||
src/transport/net_ib.cc
|
||||
src/transport/net_socket.cc
|
||||
src/transport/p2p.cc
|
||||
src/transport/shm.cc
|
||||
src/transport.cc
|
||||
src/debug.cc
|
||||
src/group.cc
|
||||
src/bootstrap.cc
|
||||
src/enqueue.cc)
|
||||
|
||||
|
||||
@@ -25,8 +25,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
|
||||
# Better define NVCC_GENCODE in your environment to the minimal set
|
||||
# of archs to reduce compile time.
|
||||
CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
|
||||
-gencode=arch=compute_35,code=sm_35 \
|
||||
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
|
||||
-gencode=arch=compute_50,code=sm_50 \
|
||||
-gencode=arch=compute_60,code=sm_60 \
|
||||
-gencode=arch=compute_61,code=sm_61
|
||||
@@ -46,7 +45,10 @@ endif
|
||||
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
|
||||
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
|
||||
CXXFLAGS += -I $(CUDA_INC)
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
||||
# Maxrregcount needs to be set according to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
|
||||
# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
|
||||
# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
||||
# Use addprefix so that we can specify more than one path
|
||||
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 4
|
||||
NCCL_PATCH := 8
|
||||
NCCL_MINOR := 5
|
||||
NCCL_PATCH := 6
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
@@ -17,7 +17,7 @@ DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
|
||||
|
||||
PKG_TIMESTAMP := $(shell date -R)
|
||||
ARCH := $(shell uname -m)
|
||||
PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
|
||||
PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g"| sed -e "s/aarch64/arm64/g")
|
||||
PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
|
||||
ifeq ($(PKG_MULTIARCH),)
|
||||
# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
|
||||
|
||||
+8
-7
@@ -9,10 +9,11 @@ include ../makefiles/version.mk
|
||||
|
||||
##### src files
|
||||
INCEXPORTS := nccl.h nccl_net.h
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
|
||||
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
|
||||
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
|
||||
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
|
||||
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
|
||||
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
|
||||
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
|
||||
|
||||
##### lib files
|
||||
LIBNAME := libnccl.so
|
||||
@@ -94,17 +95,17 @@ $(PKGDIR)/nccl.pc : nccl.pc.in
|
||||
$(INCDIR)/%.h : %.h
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(INCDIR)
|
||||
cp -f $< $@
|
||||
install -m 644 $< $@
|
||||
|
||||
$(INCDIR)/nccl_%.h : include/nccl_%.h
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(INCDIR)
|
||||
cp -f $< $@
|
||||
install -m 644 $< $@
|
||||
|
||||
$(PKGDIR)/%.pc : %.pc
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(PKGDIR)
|
||||
cp -f $< $@
|
||||
install -m 644 $< $@
|
||||
|
||||
$(OBJDIR)/%.o : %.cc
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@@ -117,8 +118,8 @@ $(OBJDIR)/%.o : %.cc
|
||||
@rm -f $(@:%.o=%.d.tmp)
|
||||
|
||||
clean :
|
||||
rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
|
||||
$(MAKE) -C collectives/device clean
|
||||
rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
|
||||
|
||||
install : lib
|
||||
mkdir -p $(PREFIX)/lib
|
||||
|
||||
+50
-58
@@ -13,11 +13,6 @@
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// Always use sockets for bootstrap
|
||||
struct bootstrapNetHandle {
|
||||
union socketAddress connectAddr;
|
||||
};
|
||||
|
||||
struct bootstrapNetComm {
|
||||
int fd;
|
||||
};
|
||||
@@ -68,36 +63,36 @@ static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr
|
||||
/* Socket Interface Selection type */
|
||||
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
|
||||
|
||||
static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
|
||||
static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large");
|
||||
static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
|
||||
// if dev >= 0, listen based on dev
|
||||
if (dev >= 0) {
|
||||
NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr)));
|
||||
NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
|
||||
} else if (dev == findSubnetIf) {
|
||||
// handle stores a remote address
|
||||
// need to find a local addr that is in the same network as the remote addr
|
||||
union socketAddress localAddr;
|
||||
char ifName[MAX_IF_NAME_SIZE];
|
||||
if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
return ncclSystemError;
|
||||
}
|
||||
// pass the local address back
|
||||
memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
|
||||
memcpy(connectAddr, &localAddr, sizeof(localAddr));
|
||||
} // Otherwise, handle stores a local address
|
||||
struct bootstrapNetComm* comm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&comm));
|
||||
NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
|
||||
NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
|
||||
*listenComm = comm;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) {
|
||||
static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
struct bootstrapNetComm* comm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&comm));
|
||||
struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
|
||||
NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
|
||||
NCCLCHECK(connectAddress(&comm->fd, connectAddr));
|
||||
*sendComm = comm;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -145,21 +140,12 @@ static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) {
|
||||
struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
|
||||
NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str));
|
||||
ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct extId {
|
||||
ncclNetHandle_t extHandleRoot;
|
||||
void* extListenComm;
|
||||
uint64_t hostHash;
|
||||
pid_t pid;
|
||||
int fd;
|
||||
pthread_t boostrapThread;
|
||||
};
|
||||
|
||||
struct extInfo {
|
||||
int rank;
|
||||
int nranks;
|
||||
@@ -177,9 +163,8 @@ static ncclResult_t setFilesLimit() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void *bootstrapRoot(void* commId) {
|
||||
static void *bootstrapRoot(void* listenComm) {
|
||||
struct extInfo info;
|
||||
struct extId* id = (struct extId*)commId;
|
||||
ncclNetHandle_t *rankHandles = NULL;
|
||||
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
|
||||
ncclNetHandle_t zero = { 0 }; // for sanity checking
|
||||
@@ -191,7 +176,7 @@ static void *bootstrapRoot(void* commId) {
|
||||
/* Receive addresses from all ranks */
|
||||
int nranks = 0, c = 0;
|
||||
do {
|
||||
NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
|
||||
|
||||
@@ -216,22 +201,22 @@ static void *bootstrapRoot(void* commId) {
|
||||
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
|
||||
|
||||
++c;
|
||||
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
|
||||
} while (c < nranks);
|
||||
TRACE(NCCL_INIT, "COLLECTED HANDLES");
|
||||
TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
|
||||
|
||||
// Send the connect handle for the next rank in the AllGather ring
|
||||
for (int r=0; r<nranks; ++r) {
|
||||
int next = (r+1) % nranks;
|
||||
void *tmpSendComm;
|
||||
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
|
||||
}
|
||||
TRACE(NCCL_INIT, "SENT OUT HANDLES");
|
||||
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
|
||||
|
||||
out:
|
||||
bootstrapNetCloseListen(id->extListenComm);
|
||||
free(commId);
|
||||
bootstrapNetCloseListen(listenComm);
|
||||
if (rankHandles) free(rankHandles);
|
||||
if (rankHandlesRoot) free(rankHandlesRoot);
|
||||
|
||||
@@ -239,31 +224,28 @@ out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
|
||||
struct extId* id = (struct extId*)commId;
|
||||
id->hostHash = getHostHash();
|
||||
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
|
||||
ncclUniqueId* threadIdCopy;
|
||||
NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
|
||||
memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
|
||||
pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
|
||||
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
void* listenComm;
|
||||
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
|
||||
pthread_t thread;
|
||||
pthread_create(&thread, NULL, bootstrapRoot, listenComm);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
|
||||
static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
|
||||
extId* id = (extId*)out;
|
||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
|
||||
static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
|
||||
memset(id, 0, sizeof(ncclUniqueId));
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env) {
|
||||
if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) {
|
||||
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
id->pid = -1;
|
||||
} else {
|
||||
id->pid = getpid();
|
||||
NCCLCHECK(bootstrapCreateRoot(out, false));
|
||||
NCCLCHECK(bootstrapCreateRoot(id, false));
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -286,9 +268,9 @@ struct extState {
|
||||
int dev;
|
||||
};
|
||||
|
||||
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
|
||||
struct extId* id = (struct extId*)commId;
|
||||
bool idFromEnv = id->pid < 0;
|
||||
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
|
||||
struct extState* state;
|
||||
NCCLCHECK(ncclCalloc(&state, 1));
|
||||
state->rank = rank;
|
||||
@@ -303,8 +285,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
|
||||
void *tmpSendComm, *tmpRecvComm;
|
||||
// Pass the remote address to listen via info
|
||||
if (idFromEnv) {
|
||||
memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
|
||||
memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
|
||||
memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
|
||||
memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
|
||||
}
|
||||
// listen will return the local address via info (specify interface type 'findSubnetIf')
|
||||
state->dev = idFromEnv ? findSubnetIf : 0;
|
||||
@@ -323,7 +305,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
|
||||
}
|
||||
|
||||
// send info on my listening socket to root
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
|
||||
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
|
||||
|
||||
@@ -334,7 +316,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
|
||||
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
|
||||
// Accept the connect request from the previous rank in the AllGather ring
|
||||
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
|
||||
|
||||
@@ -377,7 +359,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
void* tmpSendComm;
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
|
||||
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
|
||||
@@ -465,3 +447,13 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Tear down a bootstrap state: close its listen comm and both ring comms,
// then free the peer handle array and the state itself.
//
// commState: opaque pointer that is cast to struct extState*.
// Returns ncclSuccess unconditionally.
//
// NOTE(review): commState is dereferenced without a NULL check, and the
// results of the three bootstrapNetClose* calls are discarded. How those
// helpers treat a comm that was never opened is not visible here - confirm
// before calling this on a partially initialised state.
ncclResult_t bootstrapAbort(void* commState) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
bootstrapNetCloseListen(state->extBstrapListenComm);
|
||||
bootstrapNetCloseSend(state->extBstrapRingSendComm);
|
||||
bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
|
||||
free(state->peerBstrapHandles);
|
||||
free(state);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
|
||||
@@ -68,4 +68,4 @@ $(DEVOBJ) : $(LIBOBJ)
|
||||
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
|
||||
|
||||
clean:
|
||||
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
|
||||
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "all_gather.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
|
||||
IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
|
||||
|
||||
@@ -13,7 +13,7 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
@@ -21,15 +21,15 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
@@ -134,3 +134,69 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
|
||||
// Empty body: this AllGather kernel variant does nothing when called.
// NOTE(review): presumably kept only so this name exists for whatever
// instantiates the kernel variants - confirm against the dispatch code.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
#include "prims_ll128.h"
|
||||
// AllGather over the channel's ring using ncclLL128Primitives.
//
// For every chunk assigned to this block (selected via args->bid) the body
// calls, in order: send/copySend on the local input chunk, recvCopySend
// nranks-2 times, and finally recv. Destination offsets in the output buffer
// are chunkOffset + rankDest * size, with rankDest read from
// ring->devUserRanks.
//
// args fields read here: bid, nThreads, comm, opCount, N, nChannels,
// ThisInput, ThisOutput.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
// The channel is indexed by blockIdx.x, while chunk placement below uses args->bid.
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
// Primitives object wired to the ring's prev and next connections.
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
// Initial chunk size in elements of T, derived from the thread count and the LL128 constants.
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
|
||||
// loopSize is fixed here from the initial chunkSize; it is not recomputed
// when chunkSize is reduced inside the loop.
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Shrink chunkSize to the remaining data split across channels, rounded up
// to a multiple of minChunkSize; it never grows back.
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
// NOTE(review): nelem is int while chunkSize/size are ssize_t, and
// size-chunkOffset is not clamped at zero here - how the primitives treat a
// non-positive count is not visible in this block; confirm.
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// When the input chunk already sits at its output slot, only send it;
// otherwise copy it into the output while sending.
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// Empty body: this AllGather kernel variant does nothing when called.
// NOTE(review): presumably kept only so this name exists for whatever
// instantiates the kernel variants - confirm against the dispatch code.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -1,15 +1,11 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "all_reduce.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
|
||||
IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
|
||||
IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
|
||||
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
|
||||
IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
|
||||
|
||||
@@ -13,7 +13,7 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
@@ -34,7 +34,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
|
||||
@@ -103,25 +103,31 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
const ssize_t size = args->N;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = args->lastChunkSize;
|
||||
int chunkSize = args->lastChunkSize;
|
||||
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeUp;
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclPrimitives<1, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<1, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -137,8 +143,9 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
|
||||
} while(0);
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeDn;
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclPrimitives<1, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<1, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -170,6 +177,8 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
|
||||
|
||||
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
@@ -177,10 +186,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
@@ -189,7 +195,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
|
||||
// step 0: push data to next GPU
|
||||
slice = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
@@ -197,7 +203,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
@@ -206,7 +212,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
slice = ring->devUserRanks[0];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
@@ -214,7 +220,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
@@ -222,7 +228,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
slice = ring->devUserRanks[1];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
@@ -238,16 +244,21 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeUp;
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
@@ -265,6 +276,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
||||
} while(0);
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeDn;
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
@@ -281,3 +293,143 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
} while(0);
|
||||
}
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
|
||||
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int slice;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
slice = ring->devUserRanks[nranks-1];
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
slice = ring->devUserRanks[0];
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
slice = ring->devUserRanks[1];
|
||||
offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* treeUp = &channel->treeUp;
|
||||
struct ncclTree* treeDn = &channel->treeDn;
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = args->lastChunkSize;
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
if (treeUp->up == -1) {
|
||||
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
}
|
||||
} else {
|
||||
if (tid < nthreadsSplit) {
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (treeUp->down[0] == -1) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (treeDn->down[0] == -1) {
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "broadcast.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
|
||||
IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
|
||||
|
||||
@@ -13,7 +13,7 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
@@ -36,7 +36,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
@@ -121,3 +121,53 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
|
||||
// Tree + LL variant of broadcast. The body is empty, so calling it performs
// no work. NOTE(review): presumably a placeholder so that the Tree/LL name
// exists for the generated function tables -- confirm.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->root;
|
||||
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tree + LL128 variant of broadcast. The body is empty, so calling it performs
// no work. NOTE(review): presumably a placeholder so that the Tree/LL128 name
// exists for the generated function tables -- confirm.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -9,10 +9,8 @@
|
||||
#ifndef NCCL_DEVICE_COMMON_H_
|
||||
#define NCCL_DEVICE_COMMON_H_
|
||||
|
||||
#include "../collectives.h"
|
||||
#include "collectives.h"
|
||||
#include "devcomm.h"
|
||||
#include "nccl.h"
|
||||
#include <type_traits>
|
||||
|
||||
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
|
||||
// Each thread sets a predicate to true if abort == 1
|
||||
@@ -23,6 +21,7 @@
|
||||
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
|
||||
__syncthreads(); \
|
||||
if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; }
|
||||
#define __syncwarp()
|
||||
#else
|
||||
static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
uint32_t popc;
|
||||
@@ -36,12 +35,13 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
#endif
|
||||
|
||||
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype)
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL128, op, dtype), \
|
||||
NCCL_COLL_NAME(coll, op, dtype)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype)
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
@@ -79,7 +79,7 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
|
||||
// Must be consistent with ncclColl_t
|
||||
// Must be consistent with ncclFunc_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
@@ -88,9 +88,9 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
using ncclFunc_t = void (*)(struct CollectiveArgs*);
|
||||
using ncclKernelFunc_t = void (*)(struct CollectiveArgs*);
|
||||
|
||||
static const __device__ constexpr ncclFunc_t ncclFuncs[]{
|
||||
static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
@@ -123,36 +123,43 @@ struct Caller<f, f + 1>{
|
||||
inline
|
||||
__device__
|
||||
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
|
||||
if (c->funcIndex < 160) {
|
||||
if (c->funcIndex % 4 == 0) ncclBroadcastRing_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 4 == 1) ncclBroadcastRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 4 == 2) ncclBroadcastTree_copy_i8(&c->args);
|
||||
else ncclBroadcastTreeLL_copy_i8(&c->args);
|
||||
if (c->funcIndex < 240) {
|
||||
if (c->funcIndex % 6 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 2) ncclBroadcastTree_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
|
||||
else ncclBroadcastRing_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex < 320) Caller<160, 320>::call(c);
|
||||
else if (c->funcIndex < 480) {
|
||||
if (c->funcIndex % 4 == 0) ncclAllGatherRing_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 4 == 1) ncclAllGatherRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 4 == 2) ncclAllGatherTree_copy_i8(&c->args);
|
||||
else ncclAllGatherTreeLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex < 480) Caller<240, 480>::call(c);
|
||||
else if (c->funcIndex < 720) {
|
||||
if (c->funcIndex % 6 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 2) ncclAllGatherTree_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 6 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
|
||||
else ncclAllGatherRing_copy_i8(&c->args);
|
||||
}
|
||||
else Caller<480, 800>::call(c);
|
||||
else Caller<720, 1200>::call(c);
|
||||
}
|
||||
|
||||
// Cooperative block-wide copy of `size` bytes from `src` to `dst`, performed
// in int-sized words: thread `tid` copies words tid, tid+blockDim.x, ...
// Bytes beyond the last whole int (size % sizeof(int)) are not copied.
// An abort barrier runs before the copy and a __syncthreads() after it.
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
  // When aggregation is in effect, some threads may have aborted inside the
  // LL kernel; make sure the remaining threads abort as well.
  exitIfAbortBarrier(0, abortCount);

  int* const out = (int*)dst;
  int* const in = (int*)src;
  int word = tid;
  while (word < (size/sizeof(int))) {
    out[word] = in[word];
    word += blockDim.x;
  }
  __syncthreads();
}
|
||||
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) {
|
||||
|
||||
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
|
||||
// Check whether the last operation was aborted and make sure all threads exit
|
||||
int abort = tid == 0 ? *(comm->abortFlag) : 0;
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
|
||||
__syncthreads();
|
||||
if (tid == 0) hostColl->active = 0;
|
||||
}
|
||||
|
||||
extern __device__ volatile uint64_t* ncclShmem;
|
||||
|
||||
/* Functions for aggregation case */
|
||||
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
|
||||
@@ -161,12 +168,14 @@ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
|
||||
|
||||
/* Kernels with the first operation inlined */
|
||||
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
|
||||
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
int tid = threadIdx.x; \
|
||||
int bid = blockIdx.x; \
|
||||
__shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
|
||||
ncclShmem = shmem; \
|
||||
__shared__ struct ncclColl localColl; \
|
||||
__shared__ uint32_t abortCount; \
|
||||
__shared__ uint32_t sync[NCCL_LL128_MAX_NTHREADS/WARP_SIZE]; \
|
||||
if (tid == 0) abortCount = 0; \
|
||||
__syncthreads(); \
|
||||
\
|
||||
@@ -174,12 +183,13 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
struct ncclChannel* channel = comm->channels+bid; \
|
||||
struct ncclColl* c; \
|
||||
channel->abortCount = &abortCount; \
|
||||
channel->sync = sync; \
|
||||
if (bid == 0) { \
|
||||
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
|
||||
c = &firstColl; \
|
||||
} else { \
|
||||
c = &localColl; \
|
||||
load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \
|
||||
load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm, &abortCount); \
|
||||
} \
|
||||
while (1) { \
|
||||
if (tid < c->args.nThreads) { \
|
||||
@@ -198,7 +208,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
\
|
||||
/* Load next collective operation*/ \
|
||||
c = &localColl; /* for bid 0 */ \
|
||||
load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \
|
||||
load_coll(c, channel->devCollectives+nextIndex, tid, comm, &abortCount); \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -212,13 +222,14 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
|
||||
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_KERN_##op(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
|
||||
IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_KERN_##op(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \
|
||||
|
||||
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
|
||||
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
|
||||
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
|
||||
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
|
||||
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
|
||||
|
||||
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
|
||||
@@ -232,6 +243,17 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, b16, rccl_bfloat16, ncclColl, ncclOp, ncclBfloat16)
|
||||
|
||||
// Reduction define all functions
|
||||
#define IMPL_COLL_R(collf, colln) \
|
||||
IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); \
|
||||
IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); \
|
||||
IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); \
|
||||
IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
|
||||
|
||||
// Copy primitives only define one
|
||||
#define IMPL_COLL_C(collf, colln) \
|
||||
IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
|
||||
|
||||
#define COLL_UNROLL 2
|
||||
|
||||
#endif
|
||||
|
||||
@@ -300,8 +300,6 @@ __device__ void ReduceCopyMulti(const int tid, const int nthreads,
|
||||
}
|
||||
}
|
||||
|
||||
#define WARP_SIZE 64
|
||||
|
||||
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
|
||||
|
||||
@@ -9,5 +9,75 @@
|
||||
#include "collectives.h"
|
||||
#include "common.h"
|
||||
|
||||
__device__ volatile uint64_t* ncclShmem;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
// Expands to the three per-protocol entries for one (collective, op, dtype)
// combination, in the order: LL, LL128, then the plain (unsuffixed) variant.
// NOTE(review): this order presumably has to match the protocol index used
// when the function index is computed on the host -- confirm.
#define NCCL_FUNC5(coll, op, dtype) \
  NCCL_COLL_NAME(coll##LL, op, dtype), \
  NCCL_COLL_NAME(coll##LL128, op, dtype), \
  NCCL_COLL_NAME(coll, op, dtype)
|
||||
|
||||
// Table-building macros. Each level expands to a comma-separated list of the
// level below, so the final NCCL_FUNCS() list is ordered by collective, then
// reduction op, then data type, then algorithm, then protocol.

// One (collective, op, dtype) combination for both algorithms: Tree entries
// first, then Ring entries.
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
// One entry group per data type (nine groups), for collectives that have
// type-specific implementations.
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64)
|
||||
// Same number of groups as NCCL_FUNCS3A (nine), but every group refers to the
// i8 implementation.
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
|
||||
// Must be consistent with ncclRedOp_t
|
||||
// One entry group per reduction op, in the order sum, prod, max, min.
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum ), \
|
||||
NCCL_FUNCS3A(coll, prod), \
|
||||
NCCL_FUNCS3A(coll, max ), \
|
||||
NCCL_FUNCS3A(coll, min )
|
||||
// Same number of groups as NCCL_FUNCS2A (four), all referring to the `copy`
// op.
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
|
||||
// Must be consistent with ncclFunc_t
|
||||
// Full brace-enclosed initializer list: Broadcast, Reduce, AllGather,
// ReduceScatter, AllReduce. Broadcast and AllGather use the copy-only (2B)
// form; the reducing collectives use the per-op (2A) form.
#define NCCL_FUNCS() { \
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
// Device-side table of collective entry points. It is sized for every
// combination of function, reduction op, data type, algorithm and protocol,
// and filled in the same collective order as NCCL_FUNCS().
// The initializers are only emitted when __CUDA_ARCH__ is set; otherwise the
// initializer list is empty.
__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
#endif
|
||||
};
|
||||
#endif
|
||||
|
||||
// Workaround for https://reviews.llvm.org/D55580
// Empty device function: it takes no arguments and does nothing. It exists
// only for the clang issue referenced above.
|
||||
__device__ void ncclWorkaroundClangD55580() {}
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef OP128_H_
|
||||
#define OP128_H_
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// HIP variant: reads the two consecutive 64-bit words at ptr[0] and ptr[1]
// into v0 and v1, each through the LOAD macro.
// NOTE(review): these are two separate 64-bit reads, not one 128-bit access --
// confirm that callers do not depend on the pair being read atomically.
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
  v0 = LOAD(&ptr[0]);
  v1 = LOAD(&ptr[1]);
}
|
||||
|
||||
// HIP variant: writes v0 to ptr[0] and then v1 to ptr[1], each through the
// STORE macro.
// NOTE(review): these are two separate 64-bit writes, not one 128-bit access --
// confirm that readers do not depend on the pair becoming visible atomically.
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
  STORE(&ptr[0], v0);
  STORE(&ptr[1], v1);
}
|
||||
|
||||
// HIP variant: no address conversion is performed; the pointer is returned
// unchanged apart from the cast that drops the volatile qualifier.
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
  uint64_t* shmemAsmPtr = (uint64_t*)shmemGenericPtr;
  return shmemAsmPtr;
}
|
||||
|
||||
// HIP variant: reads the two consecutive 64-bit words at shmemAsmPtr[0] and
// shmemAsmPtr[1] into v0 and v1, each through the LOAD macro (two separate
// 64-bit reads).
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
  v0 = LOAD(&shmemAsmPtr[0]);
  v1 = LOAD(&shmemAsmPtr[1]);
}
|
||||
|
||||
// HIP variant: writes v0 to shmemAsmPtr[0] and then v1 to shmemAsmPtr[1], each
// through the STORE macro (two separate 64-bit writes).
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
  STORE(&shmemAsmPtr[0], v0);
  STORE(&shmemAsmPtr[1], v1);
}
|
||||
#else
|
||||
// PTX variant: loads two 64-bit values from `ptr` into v0 and v1 with a single
// volatile, global-space, 2-element vector load (ld.volatile.global.v2.u64).
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
|
||||
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
|
||||
: "=l"(v0), "=l"(v1) : "l"(ptr));
|
||||
}
|
||||
|
||||
// PTX variant: stores v0 and v1 to `ptr` with a single volatile, global-space,
// 2-element vector store (st.volatile.global.v2.u64).
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
|
||||
asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
|
||||
:: "l"(v0), "l"(v1), "l"(ptr));
|
||||
}
|
||||
|
||||
// PTX variant: converts a generic pointer into a shared-space address with
// cvta.to.shared.u64 and returns it. The result is what loadShmem128 and
// storeShmem128 take as their `shmemAsmPtr` argument.
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
|
||||
uint64_t* shmemAsmPtr;
|
||||
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
|
||||
return shmemAsmPtr;
|
||||
}
|
||||
|
||||
// PTX variant: loads two 64-bit values from the shared-space address
// `shmemAsmPtr` into v0 and v1 with a single volatile 2-element vector load
// (ld.volatile.shared.v2.u64).
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
|
||||
asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
|
||||
: "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
|
||||
}
|
||||
|
||||
// PTX variant: stores v0 and v1 to the shared-space address `shmemAsmPtr` with
// a single volatile 2-element vector store (st.volatile.shared.v2.u64).
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
|
||||
asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
|
||||
:: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -38,15 +38,27 @@ class ncclPrimitives {
|
||||
private:
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
const int wid;
|
||||
const int stepSize;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
const int stepSize;
|
||||
struct ncclConnInfo* recvConn[NRECV];
|
||||
struct ncclConnInfo* sendConn[NSEND];
|
||||
volatile uint64_t* waitPtr;
|
||||
struct ncclConnInfo* recvConn = NULL;
|
||||
volatile uint64_t* recvConnHeadPtr = NULL;
|
||||
uint64_t recvConnHead;
|
||||
volatile uint64_t* recvConnTailPtr = NULL;
|
||||
uint64_t recvConnTail;
|
||||
uint64_t recvConnTailCache; // Cache last seen value
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile uint64_t* sendConnTailPtr = NULL;
|
||||
uint64_t sendConnTail;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
uint64_t sendConnHead;
|
||||
uint64_t sendConnHeadCache; // Cache last seen value
|
||||
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
uint64_t sendConnHead[NSEND];
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
const T* recvDirectBuff[NRECV];
|
||||
T* sendDirectBuff[NSEND];
|
||||
@@ -56,12 +68,12 @@ class ncclPrimitives {
|
||||
struct ncclDevComm* comm;
|
||||
uint32_t* abortCount;
|
||||
|
||||
__device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
||||
__device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
||||
__device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
|
||||
__device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
|
||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
|
||||
inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
|
||||
|
||||
__device__ void barrier() {
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
@@ -72,11 +84,11 @@ class ncclPrimitives {
|
||||
uint32_t mismatch = 0;
|
||||
const uint64_t opCount;
|
||||
|
||||
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
|
||||
inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
|
||||
if (mismatch) {
|
||||
// In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
|
||||
STORE(comm->fatalDevError, ncclDevAssertedMismatch);
|
||||
} else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
|
||||
} else if (conn && LOAD(conn->opCountRem) > opCount) {
|
||||
mismatch += 1;
|
||||
}
|
||||
}
|
||||
@@ -84,68 +96,76 @@ class ncclPrimitives {
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
|
||||
inline __device__ int checkAbort(int i, int send) {
|
||||
spins++;
|
||||
if (spins == SPINS_BEFORE_CHECK_ABORT) {
|
||||
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = LOAD(comm->abortFlag);
|
||||
checkMismatch(remoteOpCount);
|
||||
if (wid == i) checkMismatch(send ? sendConn : recvConn);
|
||||
spins = 0;
|
||||
}
|
||||
return abort;
|
||||
}
|
||||
|
||||
__device__ void waitRecv(int i) {
|
||||
inline __device__ void waitSend(int nbytes) {
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
recvStep[i] += SLICESTEPS;
|
||||
if (tid == i) {
|
||||
if (sendConnHeadPtr) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t t0 = clock64();
|
||||
#endif
|
||||
while (LOAD(waitPtr) < recvStep[i]) {
|
||||
if (checkAbort(recvConn[i]->opCountRem)) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
__atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void waitSend(int i) {
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
sendStep[i] += SLICESTEPS;
|
||||
if (tid == WARP_SIZE+i) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t t0 = clock64();
|
||||
#endif
|
||||
while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
|
||||
sendConnHead[i] = LOAD(waitPtr);
|
||||
if (checkAbort(sendConn[i]->opCountRem)) break;
|
||||
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
if (checkAbort(wid, 1)) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
__atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
if (sendConnFifoPtr) {
|
||||
STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, nbytes);
|
||||
}
|
||||
sendConnHead += SLICESTEPS;
|
||||
}
|
||||
}
|
||||
|
||||
inline __device__ void postRecv(int i) {
|
||||
STORE(recvConn[i]->head, recvStep[i]);
|
||||
inline __device__ void waitRecv() {
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
if (recvConnTailPtr) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t t0 = clock64();
|
||||
#endif
|
||||
while (recvConnTailCache < recvConnTail + SLICESTEPS) {
|
||||
recvConnTailCache = LOAD(recvConnTailPtr);
|
||||
if (checkAbort(wid, 0)) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
__atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
recvConnTail += SLICESTEPS;
|
||||
}
|
||||
}
|
||||
|
||||
inline __device__ void postSend(int i) {
|
||||
if (sendConn[i]->next_hdp_reg) STORE(sendConn[i]->next_hdp_reg, 0x1);
|
||||
STORE(sendConn[i]->tail, sendStep[i]);
|
||||
inline __device__ void incRecv(int i) {
|
||||
recvStep[i] += SLICESTEPS;
|
||||
}
|
||||
inline __device__ void postRecv() {
|
||||
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += SLICESTEPS);
|
||||
}
|
||||
|
||||
__device__ void postSendSize(int i, int size) {
|
||||
if (sendConn[i]->fifo) STORE(sendConn[i]->fifo+((sendStep[i]-SLICESTEPS)%NCCL_STEPS), size);
|
||||
inline __device__ void incSend(int i) {
|
||||
sendStep[i] += SLICESTEPS;
|
||||
}
|
||||
inline __device__ void postSend() {
|
||||
if (sendConnTailPtr) {
|
||||
if (sendConn->next_hdp_reg) STORE(sendConn->next_hdp_reg, 0x1);
|
||||
STORE(sendConnTailPtr, sendConnTail += SLICESTEPS);
|
||||
}
|
||||
}
|
||||
|
||||
template <int DIRECTRECV>
|
||||
__device__ const T* directRecvPtr(int i, int directOffset) {
|
||||
inline __device__ const T* directRecvPtr(int i, int directOffset) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
|
||||
#else
|
||||
@@ -154,19 +174,38 @@ class ncclPrimitives {
|
||||
}
|
||||
|
||||
template <int DIRECTSEND>
|
||||
__device__ T* directSendPtr(int i, int directOffset) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
inline __device__ T* directSendPtr(int i, int directOffset) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
|
||||
#else
|
||||
#else
|
||||
return sendPtr(i);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTRECV>
|
||||
inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
|
||||
#else
|
||||
return sliceInc;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTSEND>
|
||||
inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
|
||||
#else
|
||||
return sliceInc;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
|
||||
__device__ void
|
||||
inline __device__ void
|
||||
GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
|
||||
int offset = 0;
|
||||
int sliceSize = stepSize * SLICESTEPS;
|
||||
int sliceSize = stepSize*SLICESTEPS;
|
||||
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
|
||||
|
||||
const T* srcs[RECV*NRECV+SRC];
|
||||
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
|
||||
@@ -182,11 +221,11 @@ class ncclPrimitives {
|
||||
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
|
||||
}
|
||||
|
||||
#pragma unroll 1
|
||||
#pragma unroll
|
||||
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
|
||||
int realSize = max(0, min(sliceSize, nelem-offset));
|
||||
FOR_SEND(waitSend);
|
||||
FOR_RECV(waitRecv);
|
||||
int realSize = max(0, min(dataSize, nelem-offset));
|
||||
if (SEND) waitSend(realSize*sizeof(T));
|
||||
if (RECV) waitRecv();
|
||||
if (realSize > 0) {
|
||||
barrier();
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
@@ -202,458 +241,187 @@ class ncclPrimitives {
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
#endif
|
||||
}
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
if (tid == 0)
|
||||
{
|
||||
FOR_SEND(postSendSize, realSize*sizeof(T));
|
||||
__threadfence_system();
|
||||
FOR_SEND(postSend);
|
||||
FOR_RECV(postRecv);
|
||||
barrier();
|
||||
FOR_SEND(incSend);
|
||||
FOR_RECV(incRecv);
|
||||
if (tid >= nthreads-WARP_SIZE) {
|
||||
if (SEND) {
|
||||
if (realSize > 0 && wid == 0) __threadfence_system();
|
||||
__syncwarp();
|
||||
postSend();
|
||||
}
|
||||
if (RECV) postRecv();
|
||||
}
|
||||
|
||||
for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
|
||||
for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
|
||||
offset += sliceSize;
|
||||
srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
|
||||
for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
|
||||
dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
|
||||
for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
|
||||
offset += realSize;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
recvConn[i] = conn;
|
||||
recvBuff[i] = (const T*)LOAD(&recvConn[i]->buff);
|
||||
recvStep[i] = LOAD(&recvConn[i]->step);
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
recvBuff[i] = (const T*)LOAD(&conn->buff);
|
||||
recvStep[i] = LOAD(&conn->step);
|
||||
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
// Return credits in case we rounded up.
|
||||
if (tid == 0) STORE(recvConn[i]->head, recvStep[i]);
|
||||
if (tid == i) {
|
||||
waitPtr = LOAD(&recvConn[i]->tail);
|
||||
STORE(recvConn[i]->opCountLoc, opCount);
|
||||
}
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
recvDirectBuff[i] = NULL;
|
||||
if (directBuff && recvConn[i]->direct) {
|
||||
if (directBuff && LOAD(&conn->direct)) {
|
||||
recvDirectBuff[i] = directBuff;
|
||||
if (tid == 0) STORE(recvConn[i]->ptrExchange, directBuff);
|
||||
if (tid == 0) STORE(conn->ptrExchange, directBuff);
|
||||
}
|
||||
#endif
|
||||
if (wid == i) recvConn = conn;
|
||||
if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
|
||||
nrecv++;
|
||||
}
|
||||
|
||||
__device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
sendConn[i] = conn;
|
||||
sendBuff[i] = (T*)LOAD(&sendConn[i]->buff);
|
||||
sendStep[i] = LOAD(&sendConn[i]->step);
|
||||
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
if (tid == WARP_SIZE+i) {
|
||||
waitPtr = LOAD(&sendConn[i]->head);
|
||||
sendConnHead[i] = LOAD(waitPtr);
|
||||
STORE(sendConn[i]->opCountLoc, opCount);
|
||||
__device__ __forceinline__ void loadRecvSync() {
|
||||
if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
|
||||
recvConnTailPtr = LOAD(&recvConn->tail);
|
||||
recvConnTailCache = LOAD(recvConnTailPtr);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
|
||||
recvConnHeadPtr = LOAD(&recvConn->head);
|
||||
// Return credits in case we rounded up.
|
||||
STORE(recvConnHeadPtr, recvConnHead);
|
||||
// Update opCount in case we skipped some operations
|
||||
STORE(recvConn->opCountLoc, opCount);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
sendBuff[i] = (T*)LOAD(&conn->buff);
|
||||
sendStep[i] = LOAD(&conn->step);
|
||||
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
sendDirectBuff[i] = NULL;
|
||||
if (directBuff && sendConn[i]->direct) {
|
||||
void* volatile* ptr = sendConn[i]->ptrExchange;
|
||||
if (directBuff && LOAD(&conn->direct)) {
|
||||
void* volatile* ptr = LOAD(&conn->ptrExchange);
|
||||
while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
|
||||
__syncthreads();
|
||||
barrier();
|
||||
if (tid == 0) STORE(ptr, NULL);
|
||||
}
|
||||
#endif
|
||||
if (wid == i) sendConn = conn;
|
||||
if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
|
||||
nsend++;
|
||||
}
|
||||
|
||||
__device__ void saveRecvConn(int i) {
|
||||
if (tid == i) {
|
||||
STORE(&recvConn[i]->step, recvStep[i]);
|
||||
__threadfence_system();
|
||||
__atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
|
||||
__device__ void loadSendSync() {
|
||||
if (tid < nsend) {
|
||||
sendConnHeadPtr = LOAD(&sendConn->head);
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
sendConnFifoPtr = LOAD(&sendConn->fifo);
|
||||
STORE(sendConn->opCountLoc, opCount);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nsend) {
|
||||
sendConnTailPtr = LOAD(&sendConn->tail);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void saveSendConn(int i) {
|
||||
if (tid == WARP_SIZE+i) {
|
||||
STORE(&sendConn[i]->step, sendStep[i]);
|
||||
__device__ void saveRecvSync() {
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
|
||||
STORE(&recvConn->step, recvConnHead);
|
||||
STORE(recvConn->opCountLoc, opCount+1);
|
||||
__threadfence_system();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void saveSendSync() {
|
||||
if (tid < nsend) {
|
||||
STORE(&sendConn->step, sendConnHead);
|
||||
STORE(sendConn->opCountLoc, opCount+1);
|
||||
__threadfence_system();
|
||||
__atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
__device__
|
||||
__device__ __forceinline__
|
||||
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
|
||||
// Make sure step is updated before we read it
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) {
|
||||
// Make sure step is updated before we read it.
|
||||
abortCount = channel->abortCount;
|
||||
__syncthreads();
|
||||
barrier();
|
||||
|
||||
// disable directBuff
|
||||
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
|
||||
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
|
||||
loadRecvSync();
|
||||
loadSendSync();
|
||||
}
|
||||
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
send(const T* src, int nelem) {
|
||||
GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
|
||||
}
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
directSend(const T* src, int directOffset, int nelem) {
|
||||
GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
|
||||
}
|
||||
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
recv(T* dst, int nelem) {
|
||||
GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
|
||||
}
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
directRecv(T* dst, int directOffset, int nelem) {
|
||||
GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
|
||||
}
|
||||
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
copySend(const T* src, T* dst, int nelem) {
|
||||
GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
|
||||
}
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
directCopySend(const T* src, T* dst, int directOffset, int nelem) {
|
||||
GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
|
||||
}
|
||||
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
recvCopySend(T* dst, int nelem) {
|
||||
GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
|
||||
}
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
directRecvCopySend(T* dst, int directOffset, int nelem) {
|
||||
GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
|
||||
}
|
||||
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
recvReduceCopy(const T* src, T* dst, int nelem) {
|
||||
GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0);
|
||||
}
|
||||
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
recvReduceSend(const T* src, int nelem) {
|
||||
GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0);
|
||||
}
|
||||
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
recvReduceCopySend(const T* src, T* dst, int nelem) {
|
||||
GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
|
||||
}
|
||||
__device__ void
|
||||
__device__ __forceinline__ void
|
||||
directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
|
||||
// Direct is only for the send part
|
||||
GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
|
||||
}
|
||||
|
||||
__device__ ~ncclPrimitives() {
|
||||
// Save steps for next collective. Have thread 0 do it to be compatible
|
||||
// with the way LL works.
|
||||
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
|
||||
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class FUNC, int NRECV, int NSEND>
|
||||
class ncclLLPrimitives {
|
||||
private:
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
struct ncclConnInfo* recvConn[NRECV];
|
||||
struct ncclConnInfo* sendConn[NSEND];
|
||||
volatile uint64_t* waitPtr;
|
||||
volatile uint64_t* postPtr;
|
||||
volatile int* fifoPtr;
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
uint64_t sendConnHead;
|
||||
union ncclLLFifoLine* recvBuff[NRECV];
|
||||
union ncclLLFifoLine* sendBuff[NSEND];
|
||||
struct ncclDevComm* comm;
|
||||
uint32_t* abortCount;
|
||||
|
||||
__device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||
__device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||
__device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
|
||||
__device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
|
||||
__device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
|
||||
__device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
// Exit If Abort Barrier : make sure all threads exit consistently
|
||||
// Each thread sets a predicate to true if val == 1
|
||||
// all CTA's threads enter the barrier and do a popc on their predicates being True
|
||||
// If any of the thread's predicate was True, all the threads call exit()
|
||||
__device__ void exitIfAbortLocalBarrier() {
|
||||
uint32_t popc;
|
||||
asm ("{");
|
||||
asm volatile (" .reg .pred barr_pred;");
|
||||
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
|
||||
asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
|
||||
asm ("}");
|
||||
if (popc) {
|
||||
// Make sure threads not participating in the operation get the abort and all threads exit
|
||||
exitIfAbortBarrier(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
__device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t mismatch = 0;
|
||||
const uint64_t opCount;
|
||||
|
||||
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
|
||||
if (mismatch > 20) {
|
||||
// We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
|
||||
// Note that we are not using _threadfence_system in LL so the error cannot be asserted
|
||||
STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
|
||||
} else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
|
||||
mismatch += 1;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
|
||||
spins++;
|
||||
if (spins == SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = LOAD(comm->abortFlag);
|
||||
checkMismatch(remoteOpCount);
|
||||
spins = 0;
|
||||
}
|
||||
return abort;
|
||||
}
|
||||
|
||||
__device__ void waitSend(int i, int nbytes) {
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
if (tid == WARP_SIZE+i) {
|
||||
while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
|
||||
sendConnHead = LOAD(waitPtr);
|
||||
if (checkAbort(sendConn[i]->opCountRem)) break;
|
||||
}
|
||||
if (fifoPtr) {
|
||||
int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
|
||||
STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void postRecv(int i) {
|
||||
recvStep[i]++;
|
||||
if (tid == i) STORE(postPtr, recvStep[i]);
|
||||
}
|
||||
|
||||
__device__ void postSend(int i, int offset) {
|
||||
// LL Cleanup : write all flags in the slice to make sure we don't have
|
||||
// data corruption when flag loops over.
|
||||
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
|
||||
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
|
||||
}
|
||||
sendStep[i]++;
|
||||
}
|
||||
|
||||
__device__ __attribute__((noinline)) uint64_t readLL(int i, int offset) {
|
||||
union ncclLLFifoLine* src = recvPtr(i) + offset;
|
||||
uint32_t flag = recvFlag(i);
|
||||
uint32_t data1, flag1, data2, flag2;
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
do {
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
|
||||
if (i4[1] == flag && i4[3] == flag) break;
|
||||
} while (!checkAbort(recvConn[i]->opCountRem));
|
||||
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
|
||||
#else
|
||||
do {
|
||||
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
|
||||
if (checkAbort(recvConn[i]->opCountRem)) break;
|
||||
} while ((flag1 != flag) || (flag2 != flag));
|
||||
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
|
||||
#endif
|
||||
return val64;
|
||||
}
|
||||
|
||||
__device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = val & 0xffffffff;
|
||||
i4[1] = flag;
|
||||
i4[2] = (val >> 32);
|
||||
i4[3] = flag;
|
||||
asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
|
||||
#else
|
||||
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Using memcpy handles misaligned pointers.
|
||||
__device__ uint64_t readAL(uint64_t* src) {
|
||||
uint64_t val;
|
||||
memcpy((char*)&val, (char*)src, sizeof(uint64_t));
|
||||
return val;
|
||||
}
|
||||
|
||||
__device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
|
||||
memcpy((char*)dst, (char*)&val, nbytes);
|
||||
}
|
||||
|
||||
template <int RECV, int SEND, int SRC, int DST>
|
||||
__device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
|
||||
uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
|
||||
FOR_SEND(waitSend, nbytes*2);
|
||||
barrier();
|
||||
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
|
||||
uint64_t* srcPack = (uint64_t*)srcPtr;
|
||||
uint64_t* dstPack = (uint64_t*)dstPtr;
|
||||
int offset = tid;
|
||||
// Do multiples of 64 bits
|
||||
#pragma unroll 1
|
||||
for (; offset<npack; offset+=nthreads) {
|
||||
// Recv : local, then intra-node, then inter-node
|
||||
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
|
||||
if (RECV) {
|
||||
if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) {
|
||||
val = MULTI<FUNC, T>()(readLL(i, offset), val);
|
||||
}
|
||||
}
|
||||
|
||||
// Send : inter-node, then intra-node, then local
|
||||
if (SEND) {
|
||||
for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
|
||||
storeLL(sendPtr(0)+offset, val, sendFlag(0));
|
||||
}
|
||||
if (DST) {
|
||||
if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
|
||||
// Last incomplete word
|
||||
storeAL(dstPack+offset, val, nbytes & 0x7);
|
||||
} else {
|
||||
storeAL(dstPack+offset, val, sizeof(uint64_t));
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
#else
|
||||
exitIfAbortLocalBarrier();
|
||||
#endif
|
||||
FOR_RECV(postRecv);
|
||||
FOR_SEND(postSend, offset);
|
||||
}
|
||||
|
||||
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
|
||||
recvConn[i] = conn;
|
||||
recvBuff[i] = recvConn[i]->llBuff;
|
||||
recvStep[i] = recvConn[i]->step;
|
||||
if (tid == i) {
|
||||
postPtr = recvConn[i]->head;
|
||||
STORE(recvConn[i]->opCountLoc, opCount);
|
||||
}
|
||||
nrecv++;
|
||||
}
|
||||
|
||||
__device__ void loadSendConn(struct ncclConnInfo* conn, int i) {
|
||||
sendConn[i] = conn;
|
||||
sendBuff[i] = sendConn[i]->llBuff;
|
||||
sendStep[i] = sendConn[i]->step;
|
||||
if (tid == WARP_SIZE+i) {
|
||||
waitPtr = sendConn[i]->head;
|
||||
fifoPtr = sendConn[i]->fifo;
|
||||
sendConnHead = LOAD(waitPtr);
|
||||
STORE(sendConn[i]->opCountLoc, opCount);
|
||||
}
|
||||
nsend++;
|
||||
}
|
||||
|
||||
__device__ void saveRecvConn(int i) {
|
||||
if (tid == i) {
|
||||
recvConn[i]->step = recvStep[i];
|
||||
__atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
|
||||
__threadfence_block();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void saveSendConn(int i) {
|
||||
if (tid == WARP_SIZE+i) {
|
||||
sendConn[i]->step = sendStep[i];
|
||||
__atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
|
||||
__threadfence_block();
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
__device__
|
||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
|
||||
// Make sure step is updated before we read it.
|
||||
abortCount = channel->abortCount;
|
||||
barrier();
|
||||
|
||||
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
|
||||
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
|
||||
}
|
||||
|
||||
__device__ void send(const T* src, int nelem) {
|
||||
return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
|
||||
}
|
||||
|
||||
__device__ void recv(T* dst, int nelem) {
|
||||
return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
|
||||
}
|
||||
|
||||
__device__ void recvReduceSend(const T* src, int nelem) {
|
||||
return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
|
||||
}
|
||||
|
||||
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
|
||||
return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
|
||||
}
|
||||
|
||||
__device__ void copySend(const T* src, T* dst, int nelem) {
|
||||
return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
|
||||
}
|
||||
|
||||
__device__ void recvCopySend(T* dst, int nelem) {
|
||||
return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
|
||||
}
|
||||
|
||||
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
|
||||
return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
|
||||
}
|
||||
|
||||
__device__ ~ncclLLPrimitives() {
|
||||
__device__ __forceinline__ ~ncclPrimitives() {
|
||||
// Save steps for the next operation
|
||||
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
|
||||
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
|
||||
saveRecvSync();
|
||||
saveSendSync();
|
||||
}
|
||||
};
|
||||
|
||||
#include "prims_ll.h"
|
||||
//#include "prims_ll128.h"
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define INIT_COUNTER \
|
||||
if (tid==0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
|
||||
if (tid == 0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
|
||||
wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); }
|
||||
|
||||
#define ACCUMULATE_COUNTER(prim) \
|
||||
if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
|
||||
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
|
||||
+ ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \
|
||||
+ wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \
|
||||
__ATOMIC_SEQ_CST); \
|
||||
|
||||
@@ -0,0 +1,293 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
template <typename T, class FUNC, int NRECV, int NSEND>
|
||||
class ncclLLPrimitives {
|
||||
private:
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
const int wid;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
struct ncclConnInfo* recvConn = NULL;
|
||||
volatile uint64_t* recvConnHeadPtr = NULL;
|
||||
uint64_t recvConnHead;
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
uint64_t sendConnHead;
|
||||
uint64_t sendConnHeadCache; // Cache last seen value
|
||||
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
union ncclLLFifoLine* recvBuff[NRECV];
|
||||
union ncclLLFifoLine* sendBuff[NSEND];
|
||||
struct ncclDevComm* comm;
|
||||
|
||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
|
||||
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
|
||||
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
|
||||
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
|
||||
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t mismatch = 0;
|
||||
const uint64_t opCount;
|
||||
|
||||
inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
|
||||
if (mismatch > 20) {
|
||||
// We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
|
||||
// Note that we are not using _threadfence_system in LL so the error cannot be asserted
|
||||
STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
|
||||
} else if (conn && LOAD(conn->opCountRem) > opCount) {
|
||||
mismatch += 1;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
inline __device__ int checkAbort(int i, int send) {
|
||||
spins++;
|
||||
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = LOAD(comm->abortFlag);
|
||||
if (wid == i) checkMismatch(send ? sendConn : recvConn);
|
||||
spins = 0;
|
||||
}
|
||||
return abort;
|
||||
}
|
||||
|
||||
inline __device__ void waitSend(int nbytes) {
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
if (sendConnHeadPtr) {
|
||||
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
if (checkAbort(wid, 1)) break;
|
||||
}
|
||||
if (sendConnFifoPtr) {
|
||||
int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
|
||||
STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, size);
|
||||
}
|
||||
sendConnHead += 1;
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
inline __device__ void incRecv(int i) {
|
||||
recvStep[i] += 1;
|
||||
}
|
||||
inline __device__ void postRecv() {
|
||||
barrier();
|
||||
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += 1);
|
||||
}
|
||||
|
||||
inline __device__ void incSend(int i, int offset) {
|
||||
// LL Cleanup : write all flags in the slice to make sure we don't have
|
||||
// data corruption when flag loops over.
|
||||
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
|
||||
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
|
||||
}
|
||||
sendStep[i]++;
|
||||
}
|
||||
|
||||
__device__ uint64_t readLL(int i, int offset) {
|
||||
union ncclLLFifoLine* src = recvPtr(i) + offset;
|
||||
uint32_t flag = recvFlag(i);
|
||||
uint32_t data1, flag1, data2, flag2;
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
do {
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
|
||||
} while ((i4[1] != flag) || (i4[3] != flag));
|
||||
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
|
||||
#else
|
||||
do {
|
||||
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
|
||||
if (checkAbort(i, 0)) break;
|
||||
} while ((flag1 != flag) || (flag2 != flag));
|
||||
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
|
||||
#endif
|
||||
return val64;
|
||||
}
|
||||
|
||||
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = val & 0xffffffff;
|
||||
i4[1] = flag;
|
||||
i4[2] = (val >> 32);
|
||||
i4[3] = flag;
|
||||
asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
|
||||
#else
|
||||
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Using memcpy handles misaligned pointers.
|
||||
__device__ uint64_t readAL(uint64_t* src) {
|
||||
uint64_t val;
|
||||
memcpy((char*)&val, (char*)src, sizeof(uint64_t));
|
||||
return val;
|
||||
}
|
||||
|
||||
__device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
|
||||
memcpy((char*)dst, (char*)&val, nbytes);
|
||||
}
|
||||
|
||||
template <int RECV, int SEND, int SRC, int DST>
|
||||
__device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
|
||||
uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
|
||||
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
|
||||
uint64_t* srcPack = (uint64_t*)srcPtr;
|
||||
uint64_t* dstPack = (uint64_t*)dstPtr;
|
||||
int offset = tid;
|
||||
|
||||
// Always waitSend in case of cleanup
|
||||
if (SEND) waitSend(npack*sizeof(union ncclLLFifoLine));
|
||||
|
||||
// Do multiples of 64 bits
|
||||
#pragma unroll 1
|
||||
for (; offset<npack; offset+=nthreads) {
|
||||
// Recv : local, then intra-node, then inter-node
|
||||
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
|
||||
if (RECV) {
|
||||
if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) {
|
||||
val = MULTI<FUNC, T>()(readLL(i, offset), val);
|
||||
}
|
||||
}
|
||||
|
||||
// Send : inter-node, then intra-node, then local
|
||||
if (SEND) {
|
||||
for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
|
||||
storeLL(sendPtr(0)+offset, val, sendFlag(0));
|
||||
}
|
||||
if (DST) {
|
||||
if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
|
||||
// Last incomplete word
|
||||
storeAL(dstPack+offset, val, nbytes & 0x7);
|
||||
} else {
|
||||
storeAL(dstPack+offset, val, sizeof(uint64_t));
|
||||
}
|
||||
}
|
||||
}
|
||||
FOR_RECV(incRecv); if (RECV) postRecv();
|
||||
FOR_SEND(incSend, offset);
|
||||
}
|
||||
|
||||
// Registers receive connection `i`: caches its LL buffer pointer and current
// step, and bumps the receive-connection count. The thread whose wid equals
// `i` additionally remembers `conn` as its own receive connection.
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
  recvBuff[i] = LOAD(&conn->llBuff);
  recvStep[i] = LOAD(&conn->step);
  if (wid == i) {
    recvConn = conn;
  }
  nrecv += 1;
}
|
||||
// Loads the receive-side synchronisation state. Only threads with
// tid >= nthreads-WARP_SIZE and wid < nrecv take part.
__device__ __forceinline__ void loadRecvSync() {
  if (tid < nthreads-WARP_SIZE || wid >= nrecv) return;

  recvConnHeadPtr = LOAD(&recvConn->head);
  recvConnHead = LOAD(&recvConn->step);
  // Update opCount in case we skipped some operations
  STORE(recvConn->opCountLoc, opCount);
}
|
||||
|
||||
// Registers send connection `i`: caches its LL buffer pointer and current
// step, and bumps the send-connection count. The thread whose wid equals
// `i` additionally remembers `conn` as its own send connection.
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
  sendBuff[i] = LOAD(&conn->llBuff);
  sendStep[i] = LOAD(&conn->step);
  if (wid == i) {
    sendConn = conn;
  }
  nsend += 1;
}
|
||||
// Loads the send-side synchronisation state. Only threads with tid < nsend
// take part: they cache the head pointer, its current value, the step and the
// FIFO pointer, then publish this operation's opCount.
__device__ __forceinline__ void loadSendSync() {
  if (tid >= nsend) return;

  sendConnHeadPtr = LOAD(&sendConn->head);
  sendConnHeadCache = LOAD(sendConnHeadPtr);
  sendConnHead = LOAD(&sendConn->step);
  sendConnFifoPtr = LOAD(&sendConn->fifo);
  STORE(sendConn->opCountLoc, opCount);
}
|
||||
|
||||
// Writes the receive-side state back to the connection (step and opCount+1),
// followed by a block-level fence. Executed by the same threads that ran
// loadRecvSync (tid >= nthreads-WARP_SIZE and wid < nrecv).
__device__ __forceinline__ void saveRecvSync() {
  if (tid < nthreads-WARP_SIZE || wid >= nrecv) return;

  STORE(&recvConn->step, recvConnHead);
  STORE(recvConn->opCountLoc, opCount+1);
  __threadfence_block();
}
|
||||
|
||||
// Writes the send-side state back to the connection (step and opCount+1),
// followed by a block-level fence. Executed by threads with tid < nsend.
__device__ __forceinline__ void saveSendSync() {
  if (tid >= nsend) return;

  STORE(&sendConn->step, sendConnHead);
  STORE(sendConn->opCountLoc, opCount+1);
  __threadfence_block();
}
|
||||
|
||||
public:
|
||||
// Builds the LL primitives for one thread.
//   tid, nthreads : this thread's index and the number of participating threads
//   recvPeers     : up to NRECV peer indices, terminated early by a negative entry
//   sendPeers     : up to NSEND peer indices, terminated early by a negative entry
//   channel, comm : channel / communicator the connections belong to
//   opCount       : operation counter published to the connections
__device__ __forceinline__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
  : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
  // Make sure step is updated before we read it.
  barrier();

  for (int i = 0; i < NRECV; i++) {
    const int peer = recvPeers[i];
    if (peer < 0) break;
    loadRecvConn(&channel->devPeers[peer].recv.conn, i);
  }
  for (int i = 0; i < NSEND; i++) {
    const int peer = sendPeers[i];
    if (peer < 0) break;
    loadSendConn(&channel->devPeers[peer].send.conn, i);
  }
  loadRecvSync();
  loadSendSync();
}
|
||||
|
||||
// Public operations. Each one forwards to LLGenericOp<RECV, SEND, SRC, DST>;
// the four flags select which stages of the generic operation are compiled in.

// Send local data from `src`.
__device__ void send(const T* src, int nelem) {
  LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
}

// Receive into `dst`.
__device__ void recv(T* dst, int nelem) {
  LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}

// Receive, reduce with `src`, and send the result on.
__device__ void recvReduceSend(const T* src, int nelem) {
  LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
}

// Receive, reduce with `src`, and store the result in `dst`.
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
  LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
}

// Copy `src` to `dst` and send it.
__device__ void copySend(const T* src, T* dst, int nelem) {
  LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
}

// Receive, store in `dst`, and send the received data on.
__device__ void recvCopySend(T* dst, int nelem) {
  LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
}

// Receive, reduce with `src`, store in `dst`, and send the result on.
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
  LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
}
|
||||
|
||||
// On destruction, write the step counters back to the connections so that
// the next operation resumes from them (receive side first, then send side).
__device__ __forceinline__ ~ncclLLPrimitives() {
  saveRecvSync();
  saveSendSync();
}
|
||||
};
|
||||
@@ -0,0 +1,427 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "op128.h"
|
||||
|
||||
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
|
||||
|
||||
template <typename T, class FUNC, int NRECV, int NSEND>
|
||||
class ncclLL128Primitives {
|
||||
private:
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
const int wid;
|
||||
const int warp;
|
||||
const bool flagThread;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
struct ncclConnInfo* recvConn = NULL;
|
||||
volatile uint64_t* recvConnHeadPtr = NULL;
|
||||
uint64_t recvConnHead;
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile uint64_t* sendConnTailPtr = NULL;
|
||||
uint64_t sendConnTail;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
uint64_t sendConnHead;
|
||||
uint64_t sendConnHeadCache; // Cache last seen value
|
||||
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
uint64_t* recvBuff[NRECV];
|
||||
uint64_t* sendBuff[NSEND];
|
||||
struct ncclDevComm* comm;
|
||||
|
||||
volatile uint64_t* shmem;
|
||||
uint32_t* sync;
|
||||
|
||||
// Element offset of the current receive slice of connection `i` inside its
// buffer: the step selects one of NCCL_STEPS slots of NCCL_LL128_SLICE_ELEMS.
inline __device__ int recvOffset(int i) {
  return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS;
}

// Same as recvOffset, for send connection `i`.
inline __device__ int sendOffset(int i) {
  return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS;
}

// Pointer to the current receive slice of connection `i`.
inline __device__ uint64_t* recvPtr(int i) {
  return recvBuff[i]+recvOffset(i);
}

// Pointer to the current send slice of connection `i`.
inline __device__ uint64_t* sendPtr(int i) {
  return sendBuff[i]+sendOffset(i);
}

// Flag value expected in data received on connection `i` for the current step.
inline __device__ uint64_t recvFlag(int i) {
  return recvStep[i]+1;
}

// Flag value written with data sent on connection `i` for the current step.
inline __device__ uint64_t sendFlag(int i) {
  return sendStep[i]+1;
}
|
||||
|
||||
// Synchronises the threads of this primitive.
// HIP builds use __syncthreads(). Otherwise a PTX named barrier over
// `nthreads` threads is used: barrier 2 when NSEND > NRECV, barrier 3 if not.
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  __syncthreads();
#else
  if (NSEND <= NRECV) {
    asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
  } else {
    asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
  }
#endif
}
|
||||
|
||||
uint32_t mismatch = 0;
|
||||
const uint64_t opCount;
|
||||
|
||||
// Heuristic detection of mismatched collectives. Each call that observes the
// peer's opCount ahead of ours increments `mismatch`; once that counter
// exceeds 20, a suspected-mismatch error is stored in comm->fatalDevError.
// `conn` may be NULL, in which case nothing is counted.
inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
  if (mismatch > 20) {
    // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
    // Note that we are not using _threadfence_system in LL so the error cannot be asserted
    STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
    return;
  }
  if (conn && LOAD(conn->opCountRem) > opCount) {
    mismatch++;
  }
}
|
||||
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
// Called once per spin of a wait loop. Every SPINS_BEFORE_CHECK_ABORT spins
// (while no abort has been seen) it re-reads comm->abortFlag, and the thread
// whose wid equals `i` runs the mismatch check on its send (send != 0) or
// receive connection. Returns the cached abort value (non-zero = abort).
inline __device__ int checkAbort(int i, int send) {
  spins += 1;
  const bool timeToPoll = (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT);
  if (!timeToPoll) return abort;

  abort = LOAD(comm->abortFlag);
  if (wid == i) {
    checkMismatch(send ? sendConn : recvConn);
  }
  spins = 0;
  return abort;
}
|
||||
|
||||
// Waits until the send connection has room for one more step, then records
// `nbytes` in the FIFO slot of the current step (if a FIFO is present) and
// advances the local head. Only threads that loaded a head pointer do the
// wait; all threads reset their spin and mismatch counters.
inline __device__ void waitSend(int nbytes) {
  spins = 0;
  mismatch = 0;
  if (!sendConnHeadPtr) return;

  // Spin until the cached head shows a free slot, or an abort is signalled.
  while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
    sendConnHeadCache = LOAD(sendConnHeadPtr);
    if (checkAbort(wid, 1)) break;
  }
  if (sendConnFifoPtr) {
    volatile int* slot = sendConnFifoPtr+sendStep[wid]%NCCL_STEPS;
    STORE(slot, nbytes);
  }
  sendConnHead += 1;
}
|
||||
|
||||
// Advances the local step counter of receive connection `i`.
inline __device__ void incRecv(int i) {
  recvStep[i]++;
}

// Publishes the incremented receive head, for threads that hold a head pointer.
inline __device__ void postRecv() {
  if (!recvConnHeadPtr) return;
  STORE(recvConnHeadPtr, recvConnHead += 1);
}

// Advances the local step counter of send connection `i`.
inline __device__ void incSend(int i) {
  sendStep[i]++;
}

// Publishes the incremented send tail, for threads that hold a tail pointer.
// A __threadfence() is issued before the tail is stored.
inline __device__ void postSend() {
  if (!sendConnTailPtr) return;
  __threadfence();
  STORE(sendConnTailPtr, sendConnTail += 1);
}
|
||||
|
||||
template <int ELEMS_PER_THREAD>
|
||||
inline __device__ void loadSrcToShmem128(int maxOffset, const uint64_t* src64Ptr) {
|
||||
#if 0
|
||||
uint64_t v[ELEMS_PER_THREAD];
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
if (u*WARP_SIZE < maxOffset) load128(src64Ptr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
}
|
||||
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
}
|
||||
#else
|
||||
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
if (u*WARP_SIZE < maxOffset) {
|
||||
uint64_t v0, v1;
|
||||
load128(src64Ptr+u*WARP_SIZE, v0, v1);
|
||||
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Element-wise copy of srcPtr[start, end) into shared memory. The thread
// starts at element start+wid and advances by WARP_SIZE. `shmem` is offset by
// 2*wid words per thread, so that offset is removed to index from the common
// base.
inline __device__ void loadSrcToShmem(int start, int end, const T* srcPtr) {
  T* smemBase = (T*)(shmem-2*wid);
  int elem = start+wid;
  while (elem < end) {
    smemBase[elem] = srcPtr[elem];
    elem += WARP_SIZE;
  }
}
|
||||
|
||||
template <int ELEMS_PER_THREAD>
|
||||
inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
|
||||
uint64_t v[ELEMS_PER_THREAD];
|
||||
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
}
|
||||
}
|
||||
|
||||
// Element-wise copy from shared memory into dstPtr[start, end). The thread
// starts at element start+wid and advances by WARP_SIZE. `shmem` is offset by
// 2*wid words per thread, so that offset is removed to index from the common
// base.
inline __device__ void storeShmemToDst(int start, int end, T* dstPtr) {
  T* smemBase = (T*)(shmem-2*wid);
  int elem = start+wid;
  while (elem < end) {
    dstPtr[elem] = smemBase[elem];
    elem += WARP_SIZE;
  }
}
|
||||
|
||||
#define WARP_MASK 0xffffffff
|
||||
|
||||
template <int ELEMS_PER_THREAD, int RECV, int SEND, int SRC, int DST>
|
||||
__device__ __forceinline__ void recvReduceSendCopy(int ll128Offset) {
|
||||
uint64_t v[ELEMS_PER_THREAD];
|
||||
|
||||
/************* Data Loading : SHMEM -> REG **************/
|
||||
if (SRC) {
|
||||
volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
v[u] = shmem64Ptr[u*(WARP_SIZE-2)];
|
||||
if (!flagThread) v[u+1] = shmem64Ptr[u*(WARP_SIZE-2)+1];
|
||||
}
|
||||
}
|
||||
/*********** End Data Loading : SHMEM -> REG ************/
|
||||
|
||||
/************************ Recv **************************/
|
||||
if (RECV) {
|
||||
uint64_t flag = recvFlag(0);
|
||||
uint64_t* ptr = recvPtr(0)+ll128Offset;
|
||||
bool needReload;
|
||||
uint64_t v0, v1;
|
||||
do {
|
||||
if (wid == 0) STORE(sync, 0);
|
||||
needReload = false;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (v1 != flag);
|
||||
}
|
||||
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
|
||||
if (LOAD(sync) == 0) break;
|
||||
} while (checkAbort(0, 0) == 0);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
|
||||
v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
|
||||
}
|
||||
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) {
|
||||
uint64_t flag = recvFlag(i);
|
||||
uint64_t* ptr = recvPtr(i)+ll128Offset;
|
||||
uint64_t v0, v1;
|
||||
do {
|
||||
if (wid == 0) STORE(sync, 0);
|
||||
needReload = false;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (v1 != flag);
|
||||
}
|
||||
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
|
||||
if (LOAD(sync) == 0) break;
|
||||
} while (checkAbort(i, 0) == 0);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = MULTI<FUNC, T>()(v0, v[u]);
|
||||
v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
/********************** End Recv ************************/
|
||||
|
||||
/************************ Send **************************/
|
||||
if (SEND) {
|
||||
for (int i=1; i<NSEND && i<nsend; i++) {
|
||||
int flag = sendFlag(i);
|
||||
uint64_t* ptr = sendPtr(i)+ll128Offset;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
}
|
||||
}
|
||||
int flag = sendFlag(0);
|
||||
uint64_t* ptr = sendPtr(0)+ll128Offset;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
}
|
||||
}
|
||||
/********************** End Send ************************/
|
||||
|
||||
/************* Data Storing : REG -> SHMEM **************/
|
||||
if (DST) {
|
||||
volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
shmem64Ptr[u*(WARP_SIZE-2)] = v[u];
|
||||
if (!flagThread) shmem64Ptr[u*(WARP_SIZE-2)+1] = v[u+1];
|
||||
}
|
||||
}
|
||||
/*********** End data Storing : REG -> SHMEM ************/
|
||||
}
|
||||
|
||||
#define LL128INC (WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD)
|
||||
#define ELEMINC (LL128INC-(LL128INC/NCCL_LL128_LINEELEMS))
|
||||
|
||||
template <int RECV, int SEND, int SRC, int DST>
|
||||
__device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem) {
|
||||
if (nelem <= 0) {
|
||||
// Don't move any data but still increase steps and sync with prev/next
|
||||
if (SEND) waitSend(0);
|
||||
FOR_SEND(incSend); if (SEND) postSend();
|
||||
FOR_RECV(incRecv); if (RECV) postRecv();
|
||||
return;
|
||||
}
|
||||
const int nelem64 = ((nelem*sizeof(T))/(2*sizeof(uint64_t)))*2;
|
||||
const uint64_t* src64Ptr = ((uint64_t*)srcPtr);
|
||||
uint64_t* dst64Ptr = ((uint64_t*)dstPtr);
|
||||
|
||||
int ll128Offset = LL128INC*warp+2*wid;
|
||||
int elemOffset = ELEMINC*warp;
|
||||
const int nwarps = nthreads/WARP_SIZE;
|
||||
|
||||
if (SEND) waitSend(DIVUP(nelem*sizeof(T), ELEMINC*sizeof(uint64_t))*LL128INC*sizeof(uint64_t));
|
||||
barrier();
|
||||
|
||||
while (elemOffset*(sizeof(uint64_t)/sizeof(T)) < nelem) {
|
||||
const int maxOffset128 = min(nelem64-elemOffset, (int)ELEMINC);
|
||||
const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
|
||||
if (SRC) {
|
||||
int done = 0;
|
||||
if ((((uint64_t)srcPtr)&0xf) == 0) {
|
||||
loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
|
||||
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
loadSrcToShmem(done, maxOffset, (T*)(src64Ptr+elemOffset));
|
||||
}
|
||||
__syncwarp();
|
||||
recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SRC, DST>(ll128Offset);
|
||||
__syncwarp();
|
||||
if (DST) {
|
||||
int done = 0;
|
||||
if ((((uint64_t)dstPtr)&0xf) == 0) {
|
||||
storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
|
||||
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
storeShmemToDst(done, maxOffset, (T*)(dst64Ptr+elemOffset));
|
||||
}
|
||||
__syncwarp();
|
||||
ll128Offset += LL128INC*nwarps;
|
||||
elemOffset += ELEMINC*nwarps;
|
||||
}
|
||||
|
||||
barrier();
|
||||
FOR_SEND(incSend); if (SEND) postSend();
|
||||
FOR_RECV(incRecv); if (RECV) postRecv();
|
||||
}
|
||||
|
||||
// Registers receive connection `i`: caches its LL128 buffer pointer and
// current step, and bumps the receive-connection count. The thread whose wid
// equals `i` additionally remembers `conn` as its own receive connection.
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
  recvBuff[i] = LOAD(&conn->ll128Buff);
  recvStep[i] = LOAD(&conn->step);
  if (wid == i) {
    recvConn = conn;
  }
  nrecv += 1;
}
|
||||
// Loads the receive-side synchronisation state. Only threads with
// tid >= nthreads-WARP_SIZE and wid < nrecv take part.
__device__ __forceinline__ void loadRecvSync() {
  if (tid < nthreads-WARP_SIZE || wid >= nrecv) return;

  recvConnHeadPtr = LOAD(&recvConn->head);
  recvConnHead = LOAD(&recvConn->step);
  // Update opCount in case we skipped some operations
  STORE(recvConn->opCountLoc, opCount);
}
|
||||
|
||||
// Registers send connection `i`: caches its LL128 buffer pointer and current
// step, and bumps the send-connection count. The thread whose wid equals `i`
// additionally remembers `conn` as its own send connection.
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
  sendBuff[i] = LOAD(&conn->ll128Buff);
  sendStep[i] = LOAD(&conn->step);
  if (wid == i) {
    sendConn = conn;
  }
  nsend += 1;
}
|
||||
// Loads the send-side synchronisation state.
//  - Threads with tid < nsend cache the head pointer, its current value, the
//    step and the FIFO pointer, and publish this operation's opCount.
//  - Threads with tid >= nthreads-WARP_SIZE and wid < nsend cache the tail
//    pointer and step, but only when the connection has a FIFO.
__device__ __forceinline__ void loadSendSync() {
  const bool waitsOnHead = (tid < nsend);
  const bool postsTail = (tid >= nthreads-WARP_SIZE && wid<nsend);

  if (waitsOnHead) {
    sendConnHeadPtr = LOAD(&sendConn->head);
    sendConnHeadCache = LOAD(sendConnHeadPtr);
    sendConnHead = LOAD(&sendConn->step);
    sendConnFifoPtr = LOAD(&sendConn->fifo);
    STORE(sendConn->opCountLoc, opCount);
  }
  if (postsTail && sendConn->fifo) {
    sendConnTailPtr = LOAD(&sendConn->tail);
    sendConnTail = LOAD(&sendConn->step);
  }
}
|
||||
|
||||
// Writes the receive-side state back to the connection (step and opCount+1),
// followed by a block-level fence. Executed by the same threads that ran
// loadRecvSync (tid >= nthreads-WARP_SIZE and wid < nrecv).
__device__ __forceinline__ void saveRecvSync() {
  if (tid < nthreads-WARP_SIZE || wid >= nrecv) return;

  STORE(&recvConn->step, recvConnHead);
  STORE(recvConn->opCountLoc, opCount+1);
  __threadfence_block();
}
|
||||
|
||||
// Writes the send-side state back to the connection (step and opCount+1),
// followed by a block-level fence. Executed by threads with tid < nsend.
__device__ __forceinline__ void saveSendSync() {
  if (tid >= nsend) return;

  STORE(&sendConn->step, sendConnHead);
  STORE(sendConn->opCountLoc, opCount+1);
  __threadfence_block();
}
|
||||
|
||||
public:
|
||||
// Builds the LL128 primitives for one thread.
//   tid, nthreads : this thread's index and the number of participating threads
//   recvPeers     : up to NRECV peer indices, terminated early by a negative entry
//   sendPeers     : up to NSEND peer indices, terminated early by a negative entry
//   channel, comm : channel / communicator the connections belong to
//   opCount       : operation counter published to the connections
// Every 8th thread (tid%8 == 7) is a flag thread. `shmem` points at this
// thread's slot in the warp's region of ncclShmem. The initialiser list is
// written in member declaration order.
__device__ __forceinline__
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
  : tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7),
    comm(comm),
    shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid),
    opCount(opCount) {
  // Per-warp slot in channel->sync, used by the receive polling loop in place
  // of __any_sync. Instantiations with NSEND > NRECV start two slots further.
  sync = channel->sync + (NSEND > NRECV ? 2 : 0) + tid/WARP_SIZE;

  // Make sure step is updated before we read it.
  barrier();

  for (int i = 0; i < NRECV; i++) {
    const int peer = recvPeers[i];
    if (peer < 0) break;
    loadRecvConn(&channel->devPeers[peer].recv.conn, i);
  }
  for (int i = 0; i < NSEND; i++) {
    const int peer = sendPeers[i];
    if (peer < 0) break;
    loadSendConn(&channel->devPeers[peer].send.conn, i);
  }
  loadRecvSync();
  loadSendSync();
}
|
||||
|
||||
// Public operations. Each one forwards to GenericOp<RECV, SEND, SRC, DST>;
// the four flags select which stages of the generic operation are compiled in.

// Send local data from `src`.
__device__ void send(const T* src, int nelem) {
  GenericOp<0, 1, 1, 0>(src, NULL, nelem);
}

// Receive into `dst`.
__device__ void recv(T* dst, int nelem) {
  GenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}

// Receive, reduce with `src`, and send the result on.
__device__ void recvReduceSend(const T* src, int nelem) {
  GenericOp<1, 1, 1, 0>(src, NULL, nelem);
}

// Receive, reduce with `src`, and store the result in `dst`.
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
  GenericOp<1, 0, 1, 1>(src, dst, nelem);
}

// Copy `src` to `dst` and send it.
__device__ void copySend(const T* src, T* dst, int nelem) {
  GenericOp<0, 1, 1, 1>(src, dst, nelem);
}

// Receive, store in `dst`, and send the received data on.
__device__ void recvCopySend(T* dst, int nelem) {
  GenericOp<1, 1, 0, 1>(NULL, dst, nelem);
}

// Receive, reduce with `src`, store in `dst`, and send the result on.
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
  GenericOp<1, 1, 1, 1>(src, dst, nelem);
}
|
||||
|
||||
// On destruction, write the step counters back to the connections so that
// the next operation resumes from them (receive side first, then send side).
__device__ __forceinline__ ~ncclLL128Primitives() {
  saveRecvSync();
  saveSendSync();
}
|
||||
};
|
||||
@@ -1,15 +1,11 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "reduce.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
|
||||
IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
|
||||
IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
|
||||
IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
|
||||
IMPL_COLL_R(ncclReduce, ncclCollReduce);
|
||||
|
||||
@@ -13,7 +13,7 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
@@ -32,7 +32,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
@@ -98,3 +98,50 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -260,15 +260,12 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
|
||||
template<>
|
||||
struct FuncSum<int8_t> {
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#else
|
||||
return addChar4(x, y);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
|
||||
@@ -278,15 +275,12 @@ struct FuncSum<int8_t> {
|
||||
template<>
|
||||
struct FuncSum<uint8_t> {
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#else
|
||||
return addChar4(x, y);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
|
||||
@@ -330,8 +324,6 @@ template<>
|
||||
struct FuncMax<int8_t> {
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
@@ -345,7 +337,6 @@ struct FuncMax<int8_t> {
|
||||
cr.a.z = max(cx.a.z, cy.a.z);
|
||||
cr.a.w = max(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
|
||||
@@ -356,8 +347,6 @@ template<>
|
||||
struct FuncMax<uint8_t> {
|
||||
union converter { uint32_t storage; uchar4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
@@ -371,7 +360,6 @@ struct FuncMax<uint8_t> {
|
||||
cr.a.z = max(cx.a.z, cy.a.z);
|
||||
cr.a.w = max(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
|
||||
@@ -383,8 +371,6 @@ template<>
|
||||
struct FuncMin<int8_t> {
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
@@ -398,7 +384,6 @@ struct FuncMin<int8_t> {
|
||||
cr.a.z = min(cx.a.z, cy.a.z);
|
||||
cr.a.w = min(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
|
||||
@@ -409,8 +394,6 @@ template<>
|
||||
struct FuncMin<uint8_t> {
|
||||
union converter { uint32_t storage; uchar4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
@@ -424,7 +407,6 @@ struct FuncMin<uint8_t> {
|
||||
cr.a.z = min(cx.a.z, cy.a.z);
|
||||
cr.a.w = min(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
|
||||
|
||||
@@ -1,15 +1,11 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "reduce_scatter.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
|
||||
IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
|
||||
IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
|
||||
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
|
||||
IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
|
||||
|
||||
@@ -13,7 +13,7 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
@@ -21,7 +21,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
@@ -29,7 +29,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
@@ -126,3 +126,66 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
|
||||
// Tree-algorithm / LL-protocol variant of ReduceScatter. The body is empty, so
// calling this kernel performs no work.
// NOTE(review): presumably the symbol exists only so that every
// algorithm/protocol combination has a kernel to reference - confirm.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// Tree-algorithm / LL128-protocol variant of ReduceScatter. The body is empty,
// so calling this kernel performs no work.
// NOTE(review): presumably kept only so that every algorithm/protocol
// combination has a kernel to reference - confirm.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -0,0 +1,170 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "nccl_net.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
int ncclDebugLevel = -1;
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
|
||||
FILE *ncclDebugFile = stdout;
|
||||
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
void ncclDebugInit() {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (ncclDebugLevel != -1) return;
|
||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||
if (nccl_debug == NULL) {
|
||||
ncclDebugLevel = NCCL_LOG_NONE;
|
||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_VERSION;
|
||||
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_WARN;
|
||||
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_ABORT;
|
||||
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_TRACE;
|
||||
}
|
||||
|
||||
/* Parse the NCCL_DEBUG_SUBSYS env var
|
||||
* This can be a comma separated list such as INIT,COLL
|
||||
* or ^INIT,COLL etc
|
||||
*/
|
||||
char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
|
||||
if (ncclDebugSubsysEnv != NULL) {
|
||||
int invert = 0;
|
||||
if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
|
||||
ncclDebugMask = invert ? ~0ULL : 0ULL;
|
||||
char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
|
||||
char *subsys = strtok(ncclDebugSubsys, ",");
|
||||
while (subsys != NULL) {
|
||||
uint64_t mask = 0;
|
||||
if (strcasecmp(subsys, "INIT") == 0) {
|
||||
mask = NCCL_INIT;
|
||||
} else if (strcasecmp(subsys, "COLL") == 0) {
|
||||
mask = NCCL_COLL;
|
||||
} else if (strcasecmp(subsys, "P2P") == 0) {
|
||||
mask = NCCL_P2P;
|
||||
} else if (strcasecmp(subsys, "SHM") == 0) {
|
||||
mask = NCCL_SHM;
|
||||
} else if (strcasecmp(subsys, "NET") == 0) {
|
||||
mask = NCCL_NET;
|
||||
} else if (strcasecmp(subsys, "GRAPH") == 0) {
|
||||
mask = NCCL_GRAPH;
|
||||
} else if (strcasecmp(subsys, "TUNING") == 0) {
|
||||
mask = NCCL_TUNING;
|
||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
if (mask) {
|
||||
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
|
||||
}
|
||||
subsys = strtok(NULL, ",");
|
||||
}
|
||||
free(ncclDebugSubsys);
|
||||
}
|
||||
|
||||
/* Parse and expand the NCCL_DEBUG_FILE path and
|
||||
* then create the debug file. But don't bother unless the
|
||||
* NCCL_DEBUG level is > VERSION
|
||||
*/
|
||||
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
|
||||
if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
|
||||
int c = 0;
|
||||
char debugFn[PATH_MAX+1] = "";
|
||||
char *dfn = debugFn;
|
||||
while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
|
||||
if (ncclDebugFileEnv[c++] != '%') {
|
||||
*dfn++ = ncclDebugFileEnv[c-1];
|
||||
continue;
|
||||
}
|
||||
switch (ncclDebugFileEnv[c++]) {
|
||||
case '%': // Double %
|
||||
*dfn++ = '%';
|
||||
break;
|
||||
case 'h': // %h = hostname
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024, '.');
|
||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||
break;
|
||||
case 'p': // %p = pid
|
||||
dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
|
||||
break;
|
||||
default: // Echo everything we don't understand
|
||||
*dfn++ = '%';
|
||||
*dfn++ = ncclDebugFileEnv[c-1];
|
||||
break;
|
||||
}
|
||||
}
|
||||
*dfn = '\0';
|
||||
if (debugFn[0] != '\0') {
|
||||
FILE *file = fopen(debugFn, "w");
|
||||
if (file != NULL) {
|
||||
INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
|
||||
ncclDebugFile = file;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
ncclEpoch = std::chrono::high_resolution_clock::now();
|
||||
#endif
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
|
||||
/* Common logging function used by the INFO, WARN and TRACE macros
|
||||
* Also exported to the dynamically loadable Net transport modules so
|
||||
* they can share the debugging mechanisms and output files
|
||||
*/
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (ncclDebugLevel == -1) ncclDebugInit();
|
||||
if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;
|
||||
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024, '.');
|
||||
int cudaDev;
|
||||
hipGetDevice(&cudaDev);
|
||||
|
||||
char buffer[1024];
|
||||
size_t len = 0;
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n");
|
||||
if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
|
||||
else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
|
||||
#ifdef ENABLE_TRACE
|
||||
else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
|
||||
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
|
||||
}
|
||||
#endif
|
||||
if (len) {
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
(void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
fprintf(ncclDebugFile,"%s\n", buffer);
|
||||
fflush(ncclDebugFile);
|
||||
}
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
|
||||
// If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
|
||||
if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
|
||||
fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
|
||||
hostname, getpid(), gettid(), cudaDev, filefunc, line);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
+104
-74
@@ -6,19 +6,17 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "checks.h"
|
||||
#include "param.h"
|
||||
|
||||
#include "collectives/collectives.h"
|
||||
#include "argcheck.h"
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype)
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
@@ -58,7 +56,7 @@
|
||||
|
||||
typedef void(*ncclKern_t)(struct ncclColl);
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
|
||||
static ncclKern_t const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
@@ -209,6 +207,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
channel->collCount = 0;
|
||||
}
|
||||
params->gridDim.x = params->blockDim.x = 0;
|
||||
comm->lastOpCount = comm->opCount;
|
||||
NCCLCHECK(transportStartProxy(comm));
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -230,20 +229,78 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
|
||||
/* Enqueueing system : computation of kernel and proxy operations parameters */
|
||||
/*****************************************************************************/
|
||||
|
||||
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
|
||||
if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
|
||||
else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
|
||||
else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
|
||||
else if (info->coll == ncclCollAllReduce) {
|
||||
if (info->nBytes <= info->comm->treeThreshold)
|
||||
info->pattern = ncclPatternTreeUpDown;
|
||||
else
|
||||
info->pattern = ncclPatternRingTwice;
|
||||
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
|
||||
// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
|
||||
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
|
||||
{ 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
|
||||
{ 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
|
||||
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
|
||||
};
|
||||
|
||||
// Select the algorithm/protocol pair with the lowest estimated completion time
// for this operation, then derive how many channels and threads to use.
//
// Inputs read from info: comm, coll, nBytes.
// Outputs written to info: algorithm, protocol, nChannels, nThreads.
// Returns ncclInternalError when no pair has a non-zero bandwidth entry.
static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
  struct ncclComm* comm = info->comm;
  float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
  // Find algorithm / protocol.
  info->algorithm = -1;
  info->protocol = -1;
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
      float bw = comm->bandwidths[info->coll][a][p];
      if (bw == 0) continue; // A zero bandwidth entry is never selected.
      // Index into the correction table: log2 of the size in 64-byte units.
      // The table has 22 columns, hence the logSize < 22 guard.
      int logSize = log2i(info->nBytes>>6);
      if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize];
      float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw);
      if (time < minTime) {
        info->algorithm = a;
        info->protocol = p;
        minTime = time;
      }
    }
  }
  if (info->algorithm == -1 || info->protocol == -1) {
    WARN("Error : no algorithm/protocol available");
    return ncclInternalError;
  }

  // minTime is a float: it must be printed with %f (it is promoted to double
  // through the variadic call); %d here is undefined behaviour.
  if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
  TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);

  int nc = comm->nChannels;
  int nt = comm->maxThreads[info->protocol];
  int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
  // Scale down for small messages: drop channels first, then (non-HIP builds
  // only) halve the thread count while it is a multiple of 128.
  while (info->nBytes < nc*nt*threadThreshold) {
    if (nc >= 2) nc--;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // do not reduce threads count on VEGA
#else
    else if ((nt % 128) == 0) nt/=2;
#endif
    else break;
  }
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
  if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
#endif
  info->nChannels = nc;
  info->nThreads = nt;
  return ncclSuccess;
}
|
||||
|
||||
// Derive info->pattern from the collective (info->coll) and the algorithm
// already chosen in info->algorithm. ReduceScatter and AllGather always use
// the ring pattern; the other collectives pick a tree pattern when the
// algorithm is NCCL_ALGO_TREE. Unknown collectives yield ncclInternalError.
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
  const bool useTree = (info->algorithm == NCCL_ALGO_TREE);
  switch (info->coll) {
    case ncclCollBroadcast:
      if (useTree) info->pattern = ncclPatternTreeDown;
      else info->pattern = ncclPatternPipelineFrom;
      return ncclSuccess;
    case ncclCollReduce:
      if (useTree) info->pattern = ncclPatternTreeUp;
      else info->pattern = ncclPatternPipelineTo;
      return ncclSuccess;
    case ncclCollReduceScatter:
    case ncclCollAllGather:
      info->pattern = ncclPatternRing;
      return ncclSuccess;
    case ncclCollAllReduce:
      if (useTree) info->pattern = ncclPatternTreeUpDown;
      else info->pattern = ncclPatternRingTwice;
      return ncclSuccess;
    default:
      break;
  }
  WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
  return ncclInternalError;
}
|
||||
|
||||
@@ -266,40 +323,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
|
||||
// Compute thresholds and limits that users can override
|
||||
ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
|
||||
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
|
||||
|
||||
// First compute nThreads
|
||||
int nt = NCCL_LL_MIN_NTHREADS;
|
||||
while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
|
||||
|
||||
// Then compute nChannels
|
||||
int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
|
||||
if (nc == 0) nc = 1;
|
||||
if (nc > info->comm->nChannels) nc = info->comm->nChannels;
|
||||
|
||||
// Check if we have a fixed LL threshold, otherwise compute it.
|
||||
int perThreadThreshold = info->comm->threadThreshold;
|
||||
if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
|
||||
ssize_t llThreshold = info->comm->llThreshold >= 0 ?
|
||||
info->comm->llThreshold :
|
||||
nc*nt*info->nchunksPerLoop*perThreadThreshold;
|
||||
|
||||
if (info->nBytes <= llThreshold) {
|
||||
*llMode = 1;
|
||||
*nChannels = nc;
|
||||
*nThreads = nt;
|
||||
} else {
|
||||
*llMode = 0;
|
||||
*nChannels = info->comm->nChannels;
|
||||
*nThreads = info->comm->nThreads;
|
||||
}
|
||||
}
|
||||
|
||||
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
|
||||
// Set nstepsPerLoop and nchunksPerLoop
|
||||
NCCLCHECK(getAlgoInfo(info));
|
||||
NCCLCHECK(getPatternInfo(info));
|
||||
NCCLCHECK(getLoopInfo(info));
|
||||
|
||||
@@ -309,48 +335,52 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
coll->args.ThisOutput = info->recvbuff;
|
||||
coll->args.comm = info->comm->devComm;
|
||||
coll->args.opCount = info->comm->opCount;
|
||||
coll->args.nChannels = info->nChannels;
|
||||
coll->args.nThreads = info->nThreads;
|
||||
|
||||
// Compute llMode, nChannels, nThreads
|
||||
int llMode;
|
||||
getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
|
||||
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
|
||||
int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
|
||||
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
|
||||
|
||||
int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
|
||||
int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
|
||||
int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
|
||||
int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
|
||||
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
|
||||
int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
|
||||
int chunkSize = stepSize*chunkSteps;
|
||||
|
||||
// Compute lastChunkSize
|
||||
if (treeMode == 1 && llMode == 0) {
|
||||
if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (info->pattern == ncclPatternTreeUpDown) {
|
||||
// Optimize chunkSize / nSteps
|
||||
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
}
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (llMode == 1) {
|
||||
} else if (info->protocol == NCCL_PROTO_LL) {
|
||||
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
|
||||
const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
|
||||
coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
|
||||
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
|
||||
coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t));
|
||||
coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
|
||||
int nstepsInter = 1+log2i(info->comm->nNodes);
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
||||
}
|
||||
|
||||
// Compute nSteps for proxies
|
||||
size_t nBytes = llMode ? info->nBytes*2 : info->nBytes;
|
||||
|
||||
int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
|
||||
int chunkEffectiveSize = chunkSize;
|
||||
if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
|
||||
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
|
||||
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
|
||||
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
|
||||
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
|
||||
proxyArgs->sliceSteps = sliceSteps;
|
||||
proxyArgs->chunkSteps = chunkSteps;
|
||||
proxyArgs->llMode = llMode;
|
||||
proxyArgs->protocol = info->protocol;
|
||||
proxyArgs->opCount = info->comm->opCount;
|
||||
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
|
||||
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
||||
nLoops, proxyArgs->nsteps, info->comm);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -403,7 +433,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
|
||||
channel->collFifoTail = opIndex;
|
||||
channel->collCount++;
|
||||
}
|
||||
/*if (llMode == 0)*/ info->comm->opCount++;
|
||||
info->comm->opCount++;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,268 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "graph.h"
|
||||
#include "trees.h"
|
||||
#include "rings.h"
|
||||
|
||||
/******************************************************************/
|
||||
/********************* Internode connection ***********************/
|
||||
/******************************************************************/
|
||||
|
||||
// Fill in the intra-node part of every channel's ring and trees from the
// per-channel rank orderings stored in ringGraph->intra / treeGraph->intra,
// and record this rank's view in topoRanks. Finally the first nChannels
// channels are copied into the next nChannels slots of comm->channels.
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
    struct ncclTopoRanks* topoRanks) {
  const int myRank = comm->rank;
  const int nLocal = comm->localRanks;
  const int channelCount = comm->nChannels;

  for (int c = 0; c < channelCount; c++) {
    struct ncclChannel* chan = comm->channels+c;

    // Start from "not connected" everywhere.
    chan->ring.prev = -1;
    chan->ring.next = -1;
    chan->treeUp.up = -1;
    chan->treeDn.up = -1;
    for (int k = 0; k < NCCL_MAX_TREE_ARITY; k++) chan->treeUp.down[k] = -1;
    for (int k = 0; k < NCCL_MAX_TREE_ARITY; k++) chan->treeDn.down[k] = -1;

    int* ringOrder = ringGraph->intra + c*nLocal;
    int* treeOrder = treeGraph->intra + c*nLocal;

    for (int pos = 0; pos < nLocal; pos++) {
      if (ringOrder[pos] == myRank) {
        topoRanks->ringRecv[c] = ringOrder[0];
        topoRanks->ringSend[c] = ringOrder[nLocal-1];
        // The two ends of the local chain stay open (-1).
        if (pos == 0) chan->ring.prev = -1;
        else chan->ring.prev = ringOrder[pos-1];
        if (pos == nLocal-1) chan->ring.next = -1;
        else chan->ring.next = ringOrder[pos+1];
      }
      if (treeOrder[pos] == myRank) {
        const int recvIndex = 0;
        int sendIndex = 1;
        if (treeGraph->pattern == NCCL_TOPO_PATTERN_TREE) sendIndex = 0;
        const int before = (pos-1+nLocal)%nLocal;
        const int after = (pos+1)%nLocal;

        // Tree loop always flows in the same direction. Other trees are symmetric, i.e.
        // up/down go in reverse directions
        const bool symmetric = (treeGraph->pattern != NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP);

        // Down tree is common
        topoRanks->treeDnRecv[c] = treeOrder[recvIndex];
        topoRanks->treeDnSend[c] = treeOrder[sendIndex];
        chan->treeDn.up = treeOrder[before];
        chan->treeDn.down[0] = treeOrder[after];

        // Up tree depends on the pattern
        if (symmetric) {
          topoRanks->treeUpRecv[c] = topoRanks->treeDnSend[c];
          topoRanks->treeUpSend[c] = topoRanks->treeDnRecv[c];
          chan->treeUp.down[0] = chan->treeDn.down[0];
          chan->treeUp.up = chan->treeDn.up;
        } else {
          topoRanks->treeUpRecv[c] = topoRanks->treeDnRecv[c];
          topoRanks->treeUpSend[c] = topoRanks->treeDnSend[c];
          chan->treeUp.down[0] = chan->treeDn.up;
          chan->treeUp.up = chan->treeDn.down[0];
        }
      }
    }
    topoRanks->ringPrev[c] = chan->ring.prev;
    topoRanks->ringNext[c] = chan->ring.next;
  }

  // Duplicate channels rings/trees
  struct ncclChannel* firstHalf = comm->channels;
  struct ncclChannel* secondHalf = firstHalf + channelCount;
  memcpy(secondHalf, firstHalf, channelCount*sizeof(struct ncclChannel));
  return ncclSuccess;
}
|
||||
|
||||
// Link the per-node ring segments of every channel across nodes.
// For node n, the rank stored in the recv table for firstRanks[n] gets as
// predecessor the rank stored in the send table for the previous node, and the
// rank in the send table for firstRanks[n] gets as successor the rank in the
// recv table for the next node (node indices wrap around). The prev/next
// tables are updated for all nodes; the local channel and its duplicate
// (c + nChannels) are updated only when this rank is the one concerned.
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
  const int channelCount = comm->nChannels;
  const int nodeCount = comm->nNodes;
  for (int c = 0; c < channelCount; c++) {
    // Per-channel slices of the rank-indexed tables.
    int* recvTab = ringRecv + c*comm->nRanks;
    int* sendTab = ringSend + c*comm->nRanks;
    int* prevTab = ringPrev + c*comm->nRanks;
    int* nextTab = ringNext + c*comm->nRanks;
    struct ncclChannel* chan = comm->channels+c;
    struct ncclChannel* chanDup = chan+channelCount;
    for (int n = 0; n < nodeCount; n++) {
      const int before = (n-1+nodeCount)%nodeCount;
      const int headRank = recvTab[firstRanks[n]];
      const int tailOfPrevNode = sendTab[firstRanks[before]];
      prevTab[headRank] = tailOfPrevNode;
      if (comm->rank == headRank) {
        chan->ring.prev = tailOfPrevNode;
        chanDup->ring.prev = tailOfPrevNode;
      }

      const int after = (n+1)%nodeCount;
      const int tailRank = sendTab[firstRanks[n]];
      const int headOfNextNode = recvTab[firstRanks[after]];
      nextTab[tailRank] = headOfNextNode;
      if (comm->rank == tailRank) {
        chan->ring.next = headOfNextNode;
        chanDup->ring.next = headOfNextNode;
      }
    }
    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, chan->ring.prev, comm->rank, chan->ring.next);
    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+channelCount, chanDup->ring.prev, comm->rank, chanDup->ring.next);
  }
  return ncclSuccess;
}
|
||||
|
||||
// For each of the nNodes nodes, copy ranks[firstRanks[n]] into indexes[n].
// Always returns ncclSuccess.
static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) {
  int n = 0;
  while (n < nNodes) {
    const int first = firstRanks[n];
    indexes[n] = ranks[first];
    n++;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Set the parent of two trees from node indices: tree0->up becomes
// indexes[u0] and tree1->up becomes indexes[u1]. An index of -1 leaves the
// corresponding tree untouched. Always returns ncclSuccess.
static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
  const bool hasUp0 = (u0 != -1);
  const bool hasUp1 = (u1 != -1);
  if (hasUp0) {
    tree0->up = indexes[u0];
  }
  if (hasUp1) {
    tree1->up = indexes[u1];
  }
  return ncclSuccess;
}
|
||||
|
||||
// Append up to two children (indexes[r0], indexes[r1]; -1 means "none") to a
// tree's down[] array. At most one slot may already be in use (value >= 0);
// if the first two slots are both taken, a warning is logged and
// ncclInternalError is returned without modifying down[].
static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
  // First candidate slot: skip slot 0 when it already holds a child.
  int slot = (down[0] >= 0) ? 1 : 0;
  if (down[slot] >= 0) {
    WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
    return ncclInternalError;
  }
  if (r0 != -1) {
    down[slot] = indexes[r0];
    slot++;
  }
  if (r1 != -1) {
    down[slot] = indexes[r1];
  }
  return ncclSuccess;
}
|
||||
|
||||
// Add children to two trees: (d0_0, d0_1) are node indices for tree0 and
// (d1_0, d1_1) for tree1; each is translated through indexes[] by
// addRanksDown. tree0 is processed first and a failure there returns
// immediately, leaving tree1 untouched.
static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
  int* children0 = tree0->down;
  NCCLCHECK(addRanksDown(children0, indexes, d0_0, d0_1));
  int* children1 = tree1->down;
  NCCLCHECK(addRanksDown(children1, indexes, d1_0, d1_1));
  return ncclSuccess;
}
|
||||
|
||||
// Cut the tree link that involves upRank: clear down[0] if it points to
// upRank, and clear the parent link when this rank is upRank itself.
// Always returns ncclSuccess.
static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
  const bool firstChildIsUpRank = (tree->down[0] == upRank);
  if (firstChildIsUpRank) {
    tree->down[0] = -1;
  }
  const bool selfIsUpRank = (rank == upRank);
  if (selfIsUpRank) {
    tree->up = -1;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Worker for connectTrees: allocates the two scratch index arrays through the
// caller-owned pointers and wires the inter-node tree links of every channel.
// May return early on error; the caller frees *indexesSendPtr / *indexesRecvPtr.
static ncclResult_t connectTreesImpl(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks, int** indexesSendPtr, int** indexesRecvPtr) {
  const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
  NCCLCHECK(ncclCalloc(indexesSendPtr, nNodes));
  NCCLCHECK(ncclCalloc(indexesRecvPtr, nNodes));
  int* indexesSend = *indexesSendPtr;
  int* indexesRecv = *indexesRecvPtr;

  // Compute tree depth. Not an exact value but a good approximation in most
  // cases
  int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);

  int u0, d0_0, d0_1, u1, d1_0, d1_1;
  NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
  for (int c=0; c<nChannels; c++) {
     struct ncclChannel* channel0 = comm->channels+c;
     struct ncclChannel* channel1 = channel0+nChannels;

     // Up trees: indexes are taken from the treeUp send/recv tables.
     NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
     NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
     NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
     NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
     // Remember the up-tree send rank of this node before the arrays are reused below.
     int root = indexesSend[node];
     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));

     // Down trees: same scratch arrays, now filled from the treeDn tables.
     NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
     NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
     NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
     NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));

     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c,           channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c,           channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
     channel0->treeUp.depth = channel1->treeUp.depth = depth;
  }
  return ncclSuccess;
}

// Connect the per-node trees of every channel (and of its duplicate at
// c + nChannels) into inter-node up and down trees, using the parent/children
// node indices returned by ncclGetDtree. Also stores an approximate depth in
// treeUp.depth of both channels.
//
// The scratch index arrays are owned here and released on every path,
// including when one of the steps fails part-way through.
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
  int* indexesSend = NULL;
  int* indexesRecv = NULL;
  ncclResult_t res = connectTreesImpl(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks, &indexesSend, &indexesRecv);
  free(indexesSend);
  free(indexesRecv);
  return res;
}
|
||||
|
||||
// Legacy naming
|
||||
NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
|
||||
NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
|
||||
// New naming
|
||||
NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
|
||||
NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
|
||||
|
||||
int ncclMinNchannels() {
|
||||
int minNchannels = 0;
|
||||
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
|
||||
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
|
||||
if (minNchannels > MAXCHANNELS) {
|
||||
WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
|
||||
minNchannels = MAXCHANNELS;
|
||||
}
|
||||
if (minNchannels < 0) minNchannels = 0;
|
||||
return minNchannels;
|
||||
}
|
||||
int ncclMaxNchannels() {
|
||||
int maxNchannels = MAXCHANNELS;
|
||||
if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
|
||||
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
|
||||
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
|
||||
if (maxNchannels < 1) {
|
||||
WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
|
||||
maxNchannels = 1;
|
||||
}
|
||||
return maxNchannels;
|
||||
}
|
||||
|
||||
// Combine the per-rank topology results (allTopoRanks) into connected rings
// and trees for this communicator, duplicate the channels, apply the
// min/max channel limits and fill `rings` through ncclBuildRings.
// comm->nChannels is updated to the final channel count.
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
  const int nranks = comm->nRanks;
  int nChannels = comm->nChannels;

  // Per-channel tables, laid out as [channel][rank].
  int *ringRecv, *ringSend, *ringPrev, *ringNext;
  int *treeUpRecv, *treeUpSend, *treeDnRecv, *treeDnSend;
  NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));

  // Gather data from all ranks
  for (int c = 0; c < nChannels; c++) {
    const int base = c*nranks;
    for (int i = 0; i < nranks; i++) {
      struct ncclTopoRanks* peer = allTopoRanks[i];
      ringRecv[base+i] = peer->ringRecv[c];
      ringSend[base+i] = peer->ringSend[c];
      ringPrev[base+i] = peer->ringPrev[c];
      ringNext[base+i] = peer->ringNext[c];
      treeUpRecv[base+i] = peer->treeUpRecv[c];
      treeUpSend[base+i] = peer->treeUpSend[c];
      treeDnRecv[base+i] = peer->treeDnRecv[c];
      treeDnSend[base+i] = peer->treeDnSend[c];
    }
  }

  // Connect rings and trees. This should also duplicate the channels.
  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
  NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));

  // Duplicate ringPrev/ringNext for ncclBuildRing
  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));

  // Duplication should be complete now
  nChannels = std::min(MAXCHANNELS,nChannels*2);
  comm->nChannels = nChannels;

  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
  // We permit combining max, then min, to only use the first channels, then duplicate them.
  nChannels = std::min((int)ncclMaxNchannels(), nChannels);
  comm->nChannels = nChannels;

  // Replicate existing channels (cyclically) until the minimum is reached.
  int c = nChannels;
  while (c < ncclMinNchannels()) {
    const int src = c-nChannels;
    memcpy(ringPrev+c*nranks, ringPrev+src*nranks, nranks*sizeof(int));
    memcpy(ringNext+c*nranks, ringNext+src*nranks, nranks*sizeof(int));
    memcpy(comm->channels+c, comm->channels+src, sizeof(struct ncclChannel));
    c++;
  }
  nChannels = c;
  comm->nChannels = c;

  // Create rings array and check all is fine
  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));

  free(ringRecv);
  free(ringSend);
  free(ringPrev);
  free(ringNext);
  free(treeUpRecv);
  free(treeUpSend);
  free(treeDnRecv);
  free(treeDnSend);

  return ncclSuccess;
}
|
||||
@@ -0,0 +1,363 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "graph.h"
|
||||
#include "topo.h"
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
|
||||
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
|
||||
|
||||
struct ncclTopoNodeList {
|
||||
struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES];
|
||||
int count;
|
||||
};
|
||||
|
||||
// Look up, in `node`, the path slot leading to the node of type `t` whose id
// is `id`. On success *path points into node->paths[t]; when no node of that
// type carries the id, a warning is emitted and ncclInternalError returned.
static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
  const int nNodes = system->nodes[t].count;
  int idx = 0;
  while (idx < nNodes && system->nodes[t].nodes[idx].id != id) {
    idx++;
  }
  if (idx == nNodes) {
    WARN("Could not find node of type %d id %lx\n", t, id);
    return ncclInternalError;
  }
  *path = node->paths[t]+idx;
  return ncclSuccess;
}
|
||||
|
||||
// Compute, for every node reachable from `baseNode`, the widest path leading
// to `baseNode`, using a breadth-first traversal of the links.
//
// Each visited node gets a paths[baseNode->type] array (allocated on demand)
// whose slot for `baseNode` is filled with the list of links to follow, the
// hop count, the path width (minimum link width along the path) and the path
// type (maximum link type along the path, forced to LINK_QPI when the path
// crosses a CPU).
//
// Returns ncclInternalError when a link has no matching reverse link.
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
  if (baseNode->paths[baseNode->type] == NULL) {
    NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
  }

  // breadth-first search to set all paths to that node in the system
  struct ncclTopoNodeList nodeList;
  struct ncclTopoNodeList nextNodeList;
  nodeList.count = 1; nodeList.list[0] = baseNode;
  nextNodeList.count = 0;
  struct ncclTopoLinkList* basePath;
  NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
  // The path from the base node to itself is empty and local.
  basePath->count = 0;
  basePath->width = LOC_WIDTH;
  basePath->type = LINK_LOC;

  while (nodeList.count) {
    nextNodeList.count = 0;
    for (int n=0; n<nodeList.count; n++) {
      struct ncclTopoNode* node = nodeList.list[n];
      struct ncclTopoLinkList* path;
      NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
      for (int l=0; l<node->nlinks; l++) {
        struct ncclTopoLink* link = node->links+l;
        struct ncclTopoNode* remNode = link->remNode;
        if (remNode->paths[baseNode->type] == NULL) {
          NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
        }
        struct ncclTopoLinkList* remPath;
        NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
        int width = std::min(path->width, link->width);
        if (remPath->width < width) {
          // Find reverse link. Track it in a local so that a value left in
          // remPath->list[0] by an earlier, narrower path cannot hide a
          // missing reverse link.
          struct ncclTopoLink* reverseLink = NULL;
          for (int r=0; r<remNode->nlinks; r++) {
            if (remNode->links[r].remNode == node) {
              reverseLink = remNode->links+r;
              break;
            }
          }
          if (reverseLink == NULL) {
            // Node ids are 64-bit: print them with %lx like the rest of the file.
            WARN("Failed to find reverse path from remNode id %lx type %d nlinks %d to node id %lx type %d",
                remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
            return ncclInternalError;
          }
          remPath->list[0] = reverseLink;
          // Copy the rest of the path
          for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
          remPath->count = path->count + 1;
          remPath->width = width;

          // Consider the path is QPI when going through the CPU
          // Also don't consider LINK_NET as we only care about the NIC->GPU path.
          int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
          remPath->type = std::max(path->type, type);

          // Add to the list for the next iteration if not already in the list
          // Disallow GPUs as intermediate steps for now
          if (remNode->type != GPU) {
            int i;
            for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
            if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
          }
        }
      }
    }
    memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
  }
  return ncclSuccess;
}
|
||||
|
||||
static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
|
||||
char line[1024];
|
||||
#ifdef ENABLE_TRACE
|
||||
INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
|
||||
#else
|
||||
sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
|
||||
int offset = strlen(line);
|
||||
#endif
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
|
||||
if (node->paths[t] == NULL) continue;
|
||||
for (int n = 0; n<system->nodes[t].count; n++) {
|
||||
#ifdef ENABLE_TRACE
|
||||
line[0] = 0;
|
||||
int offset = 0;
|
||||
for (int i=0; i<node->paths[t][n].count; i++) {
|
||||
struct ncclTopoLink* link = node->paths[t][n].list[i];
|
||||
struct ncclTopoNode* remNode = link->remNode;
|
||||
sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
|
||||
offset = strlen(line);
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
|
||||
#else
|
||||
sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
|
||||
offset = strlen(line);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#ifndef ENABLE_TRACE
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Print the paths of every GPU node, then of every NET node.
// Always returns ncclSuccess.
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
  const int printedTypes[2] = { GPU, NET };
  for (int k = 0; k < 2; k++) {
    const int type = printedTypes[k];
    for (int i = 0; i < system->nodes[type].count; i++) {
      printNodePaths(system, system->nodes[type].nodes+i);
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
// Find the CPU with the fewest hops from GPU index `gpu` and store its index
// in *retCpu. A best hop count of 0 is treated as "nothing selected yet", so
// a later CPU always replaces a candidate whose hop count is 0.
// Returns ncclInternalError when the system has no CPU node.
static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
  struct ncclTopoLinkList* cpuPaths = system->nodes[GPU].nodes[gpu].paths[CPU];
  int bestCpu = -1;
  int bestHops = 0;
  for (int c = 0; c < system->nodes[CPU].count; c++) {
    const int hops = cpuPaths[c].count;
    if (bestHops != 0 && hops >= bestHops) continue;
    bestCpu = c;
    bestHops = hops;
  }
  if (bestCpu == -1) {
    WARN("Error : could not find CPU close to GPU %d", gpu);
    return ncclInternalError;
  }
  *retCpu = bestCpu;
  return ncclSuccess;
}
|
||||
|
||||
// Rewrite the path from node (t1, i1) to node (t2, i2) so that it goes
// through CPU `c`: the links of "source -> CPU" followed by the links of
// "CPU -> destination". The resulting path is marked LINK_QPI and its width
// is the smaller of the two halves. Always returns ncclSuccess.
static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
  struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c;
  struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;

  struct ncclTopoLinkList* toCpu = srcNode->paths[CPU]+c;      // source -> CPU
  struct ncclTopoLinkList* fromCpu = cpuNode->paths[t2]+i2;    // CPU -> destination
  struct ncclTopoLinkList* combined = srcNode->paths[t2]+i2;   // path being rewritten

  int nLinks = 0;
  for (int i = 0; i < toCpu->count; i++) {
    combined->list[nLinks++] = toCpu->list[i];
  }
  for (int i = 0; i < fromCpu->count; i++) {
    combined->list[nLinks++] = fromCpu->list[i];
  }

  // Update path characteristics
  combined->count = nLinks;
  combined->type = LINK_QPI;
  combined->width = std::min(toCpu->width, fromCpu->width);
  return ncclSuccess;
}
|
||||
|
||||
// Remove/free paths for a given type
|
||||
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
|
||||
for (int n=0; n<system->nodes[t].count; n++) {
|
||||
struct ncclTopoNode* node = system->nodes[t].nodes+n;
|
||||
free(node->paths[nodeType]);
|
||||
node->paths[nodeType] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Precompute paths between GPUs/NICs (and CPUs).
// Existing paths are discarded first, so the function can be called again
// after the system changed. When `peerInfos` is non-NULL, paths are then
// adjusted for what the transports report:
//  - GPU pairs without P2P but with SHM are routed through a CPU;
//  - GPU pairs with neither get an empty path (count = 0);
//  - NIC<->GPU pairs whose gdrSupport bit for that NIC is clear are routed
//    through a CPU.
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
  // Remove everything in case we're re-computing
  for (int t = 0; t < NCCL_TOPO_NODE_TYPES; t++) {
    ncclTopoRemovePathType(system, t);
  }

  // Set direct paths from/to CPUs. We need them in many cases.
  for (int c = 0; c < system->nodes[CPU].count; c++) {
    NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
  }

  // Set direct paths from/to GPUs.
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    // Compute paths to GPU g
    NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
    if (peerInfos == NULL) continue;

    // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
    struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
    for (int p = 0; p < system->nodes[GPU].count; p++) {
      if (p == g) continue;
      struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;

      int p2p;
      NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
      if (p2p != 0) continue;

      int shm;
      NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
      if (shm != 1) {
        // We cannot communicate with that peer.
        system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
        continue;
      }

      // We cannot use GPU Direct, so we need all traffic to go through a CPU
      int cpu;
      NCCLCHECK(getLocalCpu(system, g, &cpu));
      NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
    }
  }

  // Set direct paths from/to NICs.
  for (int n = 0; n < system->nodes[NET].count; n++) {
    NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system));
    if (peerInfos == NULL) continue;

    for (int g = 0; g < system->nodes[GPU].count; g++) {
      if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) != 0) continue;

      // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
      // to go through a CPU
      int localCpu;
      NCCLCHECK(getLocalCpu(system, g, &localCpu));
      NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
      NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
    }
  }

  return ncclSuccess;
}
|
||||
|
||||
// Remove from `system` every GPU that is not in the same domain as the GPU
// of comm->rank. Two GPUs share a domain when a non-empty GPU->GPU path links
// them (directly, or through a lower-indexed GPU).
//
// For each removed GPU: links pointing at it are deleted from all other
// nodes, links pointing at GPUs stored after it are shifted down by one
// entry, its own path arrays are freed, and the GPU array is compacted.
//
// On return comm->localRanks holds the number of remaining GPUs. When that
// equals comm->nRanks, the NET paths are removed and the NET node count is
// set to 0.
// Returns ncclInternalError if a GPU recorded at the start can no longer be
// found by id.
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
  int *domains;
  int64_t *ids;
  NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
  NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
  int myDomain = 0;
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    domains[g] = g;
    ids[g] = gpu->id;
    for (int p=0; p<g; p++) {
      if (gpu->paths[GPU][p].count > 0) {
        domains[g] = std::min(domains[g], domains[p]);
      }
    }
    if (gpu->rank == comm->rank) myDomain = domains[g];
  }

  int ngpus = system->nodes[GPU].count;
  for (int i=0; i<ngpus; i++) {
    if (domains[i] == myDomain) continue;
    // Indices shift as GPUs get removed, so look the GPU up again by id.
    struct ncclTopoNode* gpu = NULL;
    int g;
    for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
      gpu = system->nodes[GPU].nodes+g;
      if (gpu->id == ids[i]) break; else gpu=NULL;
    }
    if (gpu == NULL) {
      WARN("Could not find id %lx", ids[i]);
      free(domains);
      free(ids);
      return ncclInternalError;
    }

    // Remove GPUs I can't access (even indirectly) from my view of the node
    for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
      for (int n=0; n<system->nodes[t].count; n++) {
        struct ncclTopoNode* node = system->nodes[t].nodes+n;
        if (node == gpu) continue;
        for (int l=0; l<node->nlinks; l++) {
          // Drop every link that targets the removed GPU.
          while (l<node->nlinks && node->links[l].remNode == gpu) {
            if (l<node->nlinks-1)
              memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
            node->nlinks--;
          }
          // GPUs stored after the removed one move down by one slot below.
          if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
            node->links[l].remNode--;
          }
        }
      }
    }

    // Release the path arrays owned by the removed GPU; they would otherwise
    // be leaked once the node is overwritten or falls outside the count.
    for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
      free(gpu->paths[t]);
      gpu->paths[t] = NULL;
    }

    if (g != system->nodes[GPU].count-1)
      memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
    system->nodes[GPU].count--;
  }

  comm->localRanks = system->nodes[GPU].count;
  if (system->nodes[GPU].count == comm->nRanks) {
    // Trim network
    ncclTopoRemovePathType(system, NET);
    system->nodes[NET].count = 0;
  }
  free(domains);
  free(ids);
  return ncclSuccess;
}
|
||||
|
||||
// Lower *speed to the speed of GPU `node` if that is smaller.
// The GPU speed is the larger of: the sum of the widths of its LINK_NVL
// links, and the width of its (last) LINK_PCI link.
// Always returns ncclSuccess.
static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
  int nvlSpeed = 0;
  int pciSpeed = 0;
  for (int l=0; l<node->nlinks; l++) {
    if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
    if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
  }
  *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
  return ncclSuccess;
}
|
||||
|
||||
// Compute system->maxSpeed, an upper bound used to try to accelerate the
// search. It starts at LOC_WIDTH, is lowered to the slowest GPU speed and,
// when NICs are present, to NET_WIDTH times the number of NICs that share
// the best NIC->GPU path width.
ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
  system->maxSpeed = LOC_WIDTH;
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
  }

  if (system->nodes[NET].count == 0) return ncclSuccess;

  // Try to assign one NIC per GPU
  int bestNetSpeed = 0;
  int nicsAtBestSpeed = 0;
  for (int n = 0; n < system->nodes[NET].count; n++) {
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    // Widest path from this NIC to any GPU.
    int nicSpeed = 0;
    for (int g = 0; g < system->nodes[GPU].count; g++) {
      nicSpeed = std::max(nicSpeed, net->paths[GPU][g].width);
    }
    if (nicSpeed > bestNetSpeed) {
      bestNetSpeed = nicSpeed;
      nicsAtBestSpeed = 1;
    } else if (nicSpeed == bestNetSpeed) {
      nicsAtBestSpeed++;
    }
  }
  system->maxSpeed = std::min(system->maxSpeed, nicsAtBestSpeed*NET_WIDTH);
  return ncclSuccess;
}
|
||||
|
||||
void ncclTopoFree(struct ncclTopoSystem* system) {
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
|
||||
free(system);
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
|
||||
#define MAXWIDTH 20
|
||||
#define PREFIXLEN 15
|
||||
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
|
||||
void dumpLine(int* values, int nranks, const char* prefix) {
|
||||
int prefixlen = strlen(prefix);
|
||||
char line[STRLENGTH+1];
|
||||
line[STRLENGTH] = '\0';
|
||||
memset(line, ' ', STRLENGTH);
|
||||
strncpy(line, prefix, PREFIXLEN);
|
||||
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
|
||||
INFO(NCCL_INIT,"%s", line);
|
||||
}
|
||||
|
||||
// Build the rank order of each ring and validate it.
//
// For every ring r, starting from `rank` and following next[r*nranks + ...],
// the visited ranks are stored in rings[r*nranks .. r*nranks+nranks-1].
// Rank 0 also logs each ring through dumpLine.
//
// Returns ncclInternalError when a ring references a rank outside
// [0, nranks), does not loop back to `rank` after nranks steps, or does not
// contain every rank. `prev` is not used by this function.
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
  for (int r=0; r<nrings; r++) {
    char prefix[30];
    int* ring = rings+r*nranks;
    const int* ringNext = next+r*nranks;

    int current = rank;
    for (int i=0; i<nranks; i++) {
      // Validate before using `current` as an index into next[].
      if (current < 0 || current >= nranks) {
        WARN("Error : ring %d contains invalid rank %d", r, current);
        return ncclInternalError;
      }
      ring[i] = current;
      current = ringNext[current];
    }
    snprintf(prefix, sizeof(prefix), "Channel %02d/%02d : ", r, nrings);
    if (rank == 0) dumpLine(ring, nranks, prefix);
    if (current != rank) {
      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
      return ncclInternalError;
    }
    // Check that all ranks are there
    for (int i=0; i<nranks; i++) {
      int found = 0;
      for (int j=0; j<nranks; j++) {
        if (ring[j] == i) {
          found = 1;
          break;
        }
      }
      if (found == 0) {
        WARN("Error : ring %d does not contain rank %d", r, i);
        return ncclInternalError;
      }
    }
  }
  return ncclSuccess;
}
|
||||
@@ -0,0 +1,7 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
|
||||
@@ -0,0 +1,594 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "graph.h"
|
||||
#include "topo.h"
|
||||
|
||||
// Reserve (width > 0) or give back (width <= 0) `width` units of bandwidth on
// every link of `path`, and keep graph->type / graph->nHops in sync.
//
//  - An empty path is a no-op: *node is left untouched.
//  - Forward (width > 0): if the path type exceeds graph->type, or a link
//    does not have `width` available, nothing is reserved, the graph is left
//    unchanged and *node is set to NULL. Otherwise the links are debited,
//    graph->nHops grows by the path length and *node is the node at the end
//    of the path.
//  - Rewind (width <= 0): graph->type is restored to `typeSave`,
//    graph->nHops shrinks by the path length and the links are credited back.
//
// graph->nHops is only increased once the whole path has been reserved, so a
// failed attempt (which callers do not rewind) leaves the hop count intact.
// Always returns ncclSuccess.
static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
  if (path->count == 0) return ncclSuccess;

  *node = NULL;
  const int forward = (width > 0);
  if (forward) {
    if (path->type > graph->type) return ncclSuccess;
  } else {
    graph->type = typeSave;
    graph->nHops -= path->count;
  }

  for (int i=0; i<path->count; i++) {
    if (path->list[i]->width < width) {
      // Can't follow this path, rewind and exit
      for (int j=0; j<i; j++) path->list[j]->width += width;
      return ncclSuccess;
    }
    path->list[i]->width -= width;
  }

  if (forward) {
    // The whole path was reserved: account for it in the graph.
    graph->type = std::max(graph->type, path->type);
    graph->nHops += path->count;
  }
  *node = path->list[path->count-1]->remNode;
  return ncclSuccess;
}
|
||||
|
||||
static int gpuPciWidth(struct ncclTopoNode* gpu) {
|
||||
for (int l=0; l<gpu->nlinks; l++) {
|
||||
struct ncclTopoLink* gpuLink = gpu->links+l;
|
||||
if (gpuLink->type != LINK_PCI) continue;
|
||||
struct ncclTopoNode* pci = gpuLink->remNode;
|
||||
for (int l=0; l<pci->nlinks; l++) {
|
||||
struct ncclTopoLink* pciLink = pci->links+l;
|
||||
if (pciLink->remNode != gpu) continue;
|
||||
return std::min(gpuLink->width, pciLink->width);
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Choose the order in which we try next GPUs. This is critical for the search
   to quickly converge to the best solution even if it eventually times out.
   Fields are listed from the least to the most significant sort key (see
   cmpScore). */
struct ncclGpuScore {
  int g;             // Retain the index
  int startIndex;    // Least important
  int intraNhops;    // hop count of the GPU->GPU path
  int intraWidth;    // width of the GPU->GPU path
  int interNhops;    // hop count of the NIC->GPU path
  int interPciWidth; // PCI width of the candidate GPU (gpuPciWidth)
  int interWidth;    // Most important: width of the NIC->GPU path
};
|
||||
|
||||
static int cmpScore(const void * g1, const void * g2) {
|
||||
struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1;
|
||||
struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2;
|
||||
int d;
|
||||
if ((d = (s2->interWidth - s1->interWidth))) return d;
|
||||
if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
|
||||
if ((d = (s1->interNhops - s2->interNhops))) return d;
|
||||
if ((d = (s2->intraWidth - s1->intraWidth))) return d;
|
||||
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
|
||||
return s1->startIndex - s2->startIndex;
|
||||
}
|
||||
|
||||
static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
|
||||
int intraWidth = scores[0].intraWidth;
|
||||
int intraNhops = scores[0].intraNhops;
|
||||
for (int i=1; i<count; i++) {
|
||||
if (scores[i].intraWidth != intraWidth || scores[i].intraNhops != intraNhops) return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Find the first NET node whose `used` mask has a bit of `flag` set and
// return its NIC->GPU paths in *netPaths.
// Returns ncclInternalError when no NET node matches.
static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
  const int nNets = system->nodes[NET].count;
  int n = 0;
  while (n < nNets && !(system->nodes[NET].nodes[n].used & flag)) {
    n++;
  }
  if (n == nNets) return ncclInternalError;

  *netPaths = system->nodes[NET].nodes[n].paths[GPU];
  return ncclSuccess;
}
|
||||
|
||||
// List, in the order they should be tried, the GPUs that can follow `gpu` in
// the channel being built.
//
// Candidates are the GPUs reachable from `gpu` (non-empty path) that are not
// yet used in the current channel. They are sorted with cmpScore; when
// `sortNet` is non-zero the NIC->GPU paths of the NIC used by this channel
// also take part in the score. When sortNet == -1 and all candidates have the
// same intra-node score, the order is reversed.
//
// Outputs: next[0..*countPtr) receives the GPU indices.
ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
  const uint64_t flag = 1ULL<<(graph->nChannels);
  const int ngpus = system->nodes[GPU].count;
  struct ncclTopoLinkList* paths = gpu->paths[GPU];
  struct ncclTopoLinkList* netPaths = NULL;
  if (sortNet) {
    NCCLCHECK(getNetPaths(system, flag, &netPaths));
  }

  struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
  memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));

  // Walk the other GPUs starting right after `gpu`, wrapping around.
  const int start = gpu-system->nodes[GPU].nodes;
  int count = 0;
  for (int i = 1; i < ngpus; i++) {
    const int g = (start+i)%ngpus;
    if (paths[g].count == 0) continue; // There is no path to that GPU
    if (system->nodes[GPU].nodes[g].used & flag) continue;

    struct ncclGpuScore* score = scores+count;
    score->g = g;
    score->startIndex = i;
    score->intraNhops = paths[g].count;
    score->intraWidth = paths[g].width;
    if (netPaths) {
      score->interNhops = netPaths[g].count;
      score->interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
      score->interWidth = netPaths[g].width;
    }
    count++;
  }

  // Sort GPUs
  qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore);

  // Check if all have the same intra-node score in which case we go reverse for sortNet = -1
  const int reverse = (sortNet == -1 && cmpIntraScores(scores, count) == 0);
  for (int i = 0; i < count; i++) {
    next[i] = scores[reverse ? count-1-i : i].g;
  }
  *countPtr = count;
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
|
||||
|
||||
// Budget of search steps; this should contain the whole search within a second or so.
#define NCCL_SEARCH_TIMEOUT (1ULL<<20)

// Values for the forcedOrder argument of the search functions.
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
|
||||
|
||||
// Replay helper: find the GPU that followed position `step` in the previous
// channel of `graph` and store its index in *g.
// *g is -1 and ncclInternalError is returned when the graph has no channel
// yet or when no GPU has the expected rank.
ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) {
  *g = -1;
  if (graph->nChannels == 0) return ncclInternalError;

  const int ngpus = system->nodes[GPU].count;
  const int prevChannel = graph->nChannels-1;
  const int nextRank = graph->intra[prevChannel*ngpus+step+1];

  int i = 0;
  while (i < ngpus && system->nodes[GPU].nodes[i].rank != nextRank) {
    i++;
  }
  if (i == ngpus) return ncclInternalError;

  *g = i;
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
|
||||
|
||||
// Try GPU index `g` as the next step of the channel being built.
// When `paths` is given, `speed` is first reserved along paths[g]; if that
// fails the GPU is skipped. Otherwise the GPU is marked used for the current
// channel, the search recurses, then the mark and the reservation are undone.
ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
  const int typeSave = graph->type;
  const uint64_t flag = 1ULL<<(graph->nChannels);
  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;

  if (paths) {
    NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave));
  }
  // The path could not be followed: nothing to undo.
  if (gpu == NULL) return ncclSuccess;

  gpu->used ^= flag;
  NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
  gpu->used ^= flag;

  if (paths) {
    NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave));
  }
  return ncclSuccess;
}
|
||||
|
||||
// Decide whether `graph` is a better solution than `refGraph`.
// *copy is set to 1 when it is and left untouched otherwise.
// Always returns ncclSuccess.
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
  // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels
  // since it would likely impact the rings algorithms too.
  if (graph->nChannels < refGraph->nChannels && graph->speedIntra > graph->speedInter) {
    return ncclSuccess;
  }

  // 1. Try to get better bandwidth: any difference settles the comparison.
  if (graph->nChannels*graph->speedIntra != refGraph->nChannels*refGraph->speedIntra) {
    if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) {
      *copy = 1;
    }
    return ncclSuccess;
  }

  // 2. Give an advantage when all channels are the same
  if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) {
    *copy = 1;
    return ncclSuccess;
  }

  // 3. Less hops
  if (graph->nHops < refGraph->nHops) {
    *copy = 1;
  }
  return ncclSuccess;
}
|
||||
|
||||
/* One recursive step of the channel search, standing on `gpu` at position
 * `step` of the channel currently being built (channel index graph->nChannels).
 *
 *   system          topology being searched
 *   graph           working graph, mutated during the search and restored on
 *                   the way back
 *   saveGraph       best solution so far; overwritten when a better one is found
 *   gpu             GPU node reached at this step
 *   step            position in the channel; step == number of GPUs means the
 *                   channel is complete
 *   backToNet       step at which the path must return to a NET node, or -1
 *   backToFirstRank step at which the path must loop back to the first GPU
 *                   of the channel, or -1
 *   forcedOrder     FORCED_ORDER_PCI, FORCED_ORDER_REPLAY, or anything else
 *                   for the normal sorted search
 *   maxSpeed        aggregate bandwidth at which the search stops early
 *   time            remaining search budget; decremented at every call and
 *                   set to -1 once a solution reaching maxSpeed is saved
 *
 * Returns ncclSuccess, ncclInternalError if the first rank of the channel
 * cannot be found, or any error propagated through NCCLCHECK.
 */
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
  // Budget exhausted (0) or search already finished (-1): unwind silently.
  if ((*time) <= 0) return ncclSuccess;
  (*time)--;

  int ngpus = system->nodes[GPU].count;
  if (step == ngpus) {
    // Determine whether we found a better solution or not
    int copy = 0;
    int sameChannels = graph->sameChannels;
    if (graph->nChannels > 0) {
      // Compare the channel just completed against the previous one
      // (intra[g-ngpus] addresses the same position one channel earlier).
      int* intra = graph->intra+graph->nChannels*ngpus;
      for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0;
    }
    graph->nChannels++;
    NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
    if (copy) {
      memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
      // Reaching maxSpeed ends the whole search.
      if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1;
    }
    if (graph->nChannels < MAXCHANNELS/2) {
      // Try to stack one more channel on top of this one.
      NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time));
    }
    // Undo the changes made for this channel before returning to the caller.
    graph->nChannels--;
    graph->sameChannels = sameChannels;
    return ncclSuccess;
  }

  graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
  if (step == backToNet) {
    // first get back to NIC
    if (system->nodes[NET].count) {
      // First pass: best width among the NICs we are allowed to use. Unless
      // crossNic is 1, only the NIC the channel entered through qualifies.
      int maxWidth = 0;
      struct ncclTopoLinkList* paths = gpu->paths[NET];
      for (int n=0; n<system->nodes[NET].count; n++) {
        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
        maxWidth = std::max(paths[n].width, maxWidth);
      }
      // Second pass: try every allowed NIC that offers that best width.
      for (int n=0; n<system->nodes[NET].count; n++) {
        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
        if (paths[n].width == maxWidth) {
          struct ncclTopoNode* net = system->nodes[NET].nodes+n;
          int typeSave = graph->type;
          NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
          if (net) {
            graph->inter[graph->nChannels*2+1] = net->id;
            // Same GPU and step, but with the NET return marked as done (-1).
            NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time));
            // Follow the path again with the negated speed to undo the first call.
            NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave));
          }
        }
      }
    }
  } else if (step < system->nodes[GPU].count-1) {
    // Go to next GPU
    struct ncclTopoLinkList* paths = gpu->paths[GPU];
    int next[NCCL_TOPO_MAX_NODES];
    int count;
    if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
      next[0] = step+1;
      count = 1;
    } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
      count = 1;
    } else { // Normal search
      NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
    }
    for (int i=0; i<count; i++) {
      int g = next[i];
      int nvlink = graph->nvlink;
      graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
      int speed = graph->speedIntra;
      if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
      graph->nvlink = nvlink;
    }
  } else if (step == backToFirstRank) {
    // Find first GPU and loop back to it
    int g;
    int rank = graph->intra[graph->nChannels*ngpus];
    for (g=0; g<ngpus; g++) {
      if (system->nodes[GPU].nodes[g].rank == rank) break;
    }
    if (g == ngpus) {
      WARN("Could not find GPU with rank %d\n", rank);
      return ncclInternalError;
    }
    struct ncclTopoLinkList* paths = gpu->paths[GPU];
    struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
    int typeSave = graph->type;
    NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
    if (firstGpu) {
      NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
      NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave));
    }
  } else {
    // Next path
    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time));
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Start one channel of the search from every NET node that is not in use yet.
 *
 * For each free NIC the function records it as the channel's entry NIC, marks
 * every NET node sharing its rank as used by this channel, and tries in turn:
 * the PCI order, a replay of the previous channel (when one exists), and the
 * GPUs closest to the NIC. The `used` marks are cleared again before moving
 * on to the next NIC.
 *
 * Returns ncclSuccess or any error propagated through NCCLCHECK.
 */
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
  // One bit per channel in the nodes' `used` mask.
  const uint64_t channelBit = 1ULL<<(graph->nChannels);
  const int interSpeed = graph->speedInter;

  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    if (net->used != 0) continue;

    graph->inter[graph->nChannels*2] = net->id;
    // Mark all NET nodes with the same rank as taken by this channel.
    for (int i=0; i<system->nodes[NET].count; i++) {
      struct ncclTopoNode* sibling = system->nodes[NET].nodes+i;
      if (sibling->rank == net->rank) sibling->used ^= channelBit;
    }
    struct ncclTopoLinkList* gpuPaths = net->paths[GPU];

    // First try the PCI order to set a reference
    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, gpuPaths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, interSpeed));

    // Then try to replay the last channel
    if (graph->nChannels > 0) {
      int replayGpu;
      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &replayGpu));
      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, gpuPaths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, replayGpu, interSpeed));
    }

    // Then try the most local GPUs: widest path first, fewest hops as tie-break.
    int bestWidth = 0;
    int fewestHops = 0xfffffff;
    for (int g=0; g<system->nodes[GPU].count; g++) {
      if (gpuPaths[g].width > bestWidth) {
        bestWidth = gpuPaths[g].width;
        fewestHops = gpuPaths[g].count;
      } else if (gpuPaths[g].width == bestWidth && gpuPaths[g].count < fewestHops) {
        fewestHops = gpuPaths[g].count;
      }
    }

    if (bestWidth >= interSpeed) {
      // In the first pass, avoid using GPUs in both directions between channels
      // (one channel sending from that GPU and one channel receiving to that
      // GPU), since that usually leads to lower BW.
      for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
        for (int g=0; g<system->nodes[GPU].count; g++) {
          if (gpuPaths[g].width != bestWidth || gpuPaths[g].count != fewestHops) continue;
          struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
          const int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
          if (tryGpuBidir != gpuUsed) continue;
          NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, gpuPaths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, interSpeed));
        }
      }
    }

    // Release the NET nodes marked above.
    for (int i=0; i<system->nodes[NET].count; i++) {
      struct ncclTopoNode* sibling = system->nodes[NET].nodes+i;
      if (sibling->rank == net->rank) sibling->used ^= channelBit;
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Search Patterns
|
||||
*
|
||||
* Intra-node
|
||||
* Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a
|
||||
* (=Split Tree Loop)
|
||||
* Tree : GPU a -> GPU b -> .. -> GPU x
|
||||
* (=Split Tree)
|
||||
*
|
||||
* Inter-node
|
||||
* Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
|
||||
* Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
|
||||
* `--> NET n (or m if crossNic)
|
||||
* Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
|
||||
* `--> NET n (or m if crossNic)
|
||||
* Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
|
||||
* `--> NET n (or m if crossNic)
|
||||
*/
|
||||
/* Translate a search pattern into the two step indices that drive the search:
 *   *backToNet       step at which the path returns to a NET node (-1: never)
 *   *backToFirstRank step at which the path loops back to the first GPU
 *                    (-1: never)
 * The values depend on whether the system has NET nodes; see the
 * "Search Patterns" table above. Always returns ncclSuccess.
 */
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
  const int lastGpuStep = system->nodes[GPU].count-1;
  const int isRing = (pattern == NCCL_TOPO_PATTERN_RING);
  const int isSplitTreeLoop = (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP);

  if (system->nodes[NET].count == 0) {
    // Intra-node only: no NIC to return to; rings and split-tree-loops close on the first GPU.
    *backToNet = -1;
    *backToFirstRank = (isRing || isSplitTreeLoop) ? lastGpuStep : -1;
    return ncclSuccess;
  }

  // Inter-node: rings leave through the last GPU, trees through the first,
  // every other pattern through the second.
  *backToNet = isRing ? lastGpuStep
             : (pattern == NCCL_TOPO_PATTERN_TREE) ? 0
             : 1;
  *backToFirstRank = isSplitTreeLoop ? lastGpuStep : -1;
  return ncclSuccess;
}
|
||||
|
||||
/* Search for one more channel of `graph`, starting from a NET node when the
 * system has any and from GPU 0 otherwise.
 *
 *   system     topology being searched
 *   graph      working graph (graph->pattern selects the search parameters)
 *   saveGraph  best solution so far, updated by the search
 *   maxSpeed   aggregate bandwidth at which the search stops early
 *   time       remaining search budget, shared across the recursion
 *
 * Returns ncclSuccess or the first error reported by the search.
 */
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
  int backToNet, backToFirstRank;
  NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
  if (system->nodes[NET].count) {
    // Start from NET. The result must be checked like every other search
    // call, otherwise errors raised below this point are silently lost.
    NCCLCHECK(ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time));
  } else {
    // Start from GPU 0: PCI order first, then a replay of the previous
    // channel (if any), then the normal search.
    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
    if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Parse user defined rings. Format is like :
|
||||
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
|
||||
* Rings with a non-matching number of ranks are ignored so we can provide
|
||||
* rings for multiple cases.
|
||||
*/
|
||||
#define MAX_ENV_RANKS 512
|
||||
static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
|
||||
int ranks[MAX_ENV_RANKS];
|
||||
int nChannels = 0;
|
||||
int rank = 0;
|
||||
int offset = 0;
|
||||
int status = 0; // 0 : between numbers, 1 : inside number
|
||||
do {
|
||||
int digit = str[offset] - '0';
|
||||
if (digit >= 0 && digit <= 9) {
|
||||
if (status == 0) {
|
||||
ranks[rank] = digit;
|
||||
status = 1;
|
||||
} else {
|
||||
ranks[rank] = ranks[rank]*10+digit;
|
||||
}
|
||||
} else {
|
||||
if (status == 1) {
|
||||
rank++;
|
||||
if (rank == MAX_ENV_RANKS) goto end;
|
||||
}
|
||||
status = 0;
|
||||
if (str[offset] == '|' || str[offset] == '\0') {
|
||||
// Ignore if ngpus doesn't match
|
||||
if (rank != ngpus) goto newchannel;
|
||||
|
||||
for (int r=0; r<ngpus; r++) {
|
||||
int rank = ranks[r];
|
||||
// Ignore if ranks are out of bounds
|
||||
if (rank < 0 || rank >= ngpus) goto newchannel;
|
||||
// Ignore if ranks are duplicate
|
||||
for (int i=0; i<r; i++)
|
||||
if (ranks[i] == rank) goto newchannel;
|
||||
|
||||
channels[nChannels*ngpus+r] = rank;
|
||||
}
|
||||
nChannels++;
|
||||
newchannel:
|
||||
rank = 0;
|
||||
}
|
||||
}
|
||||
} while (str[offset++] != 0);
|
||||
end:
|
||||
*nChannelsRet = nChannels;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Compute the channels of `graph` for the pattern requested in graph->pattern.
 *
 * If the NCCL_GRAPH environment variable is set and yields at least one valid
 * channel, those channels are used as-is. Otherwise the topology is searched:
 * the search starts at system->maxWidth for both speeds and, while no
 * solution has stopped the search, successively tries simpler patterns,
 * higher link types (up to LINK_QPI), crossNic when allowed, and lower
 * speeds (steps of 3). Once a solution exists, non-ring patterns try to raise
 * speedIntra alone. If nothing was found, a single channel in node order is
 * installed as a fallback.
 *
 *   system  topology to search
 *   graph   in: pattern and crossNic; out: channels, speeds and flags
 *
 * Returns ncclSuccess or any error propagated through NCCLCHECK.
 */
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
  // crossNic is only meaningful with more than one NIC.
  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
  graph->speedIntra = graph->speedInter = 0;
  if (graph->crossNic == 2) graph->crossNic = 0;
  graph->nvlink = 0;
  graph->type = LINK_LOC;
  graph->nChannels = 0;
  graph->sameChannels = 1;

  char* str = getenv("NCCL_GRAPH");
  if (str) {
    NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
    for (int i=0; i<graph->nChannels*ngpus; i++) {
      // Translate gpu numbers into ranks
      graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
    }
    // TODO : let user specify NICs
    graph->inter[0] = graph->inter[1] = 0;
    graph->speedIntra = graph->speedInter = PCI_WIDTH+2;
    graph->nvlink = 0;
    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
      // Reverse the loop
      for (int c=0; c<graph->nChannels; c++) {
        for (int i=0; i<=ngpus/2; i++) {
          // Swap positions i and (ngpus-i)%ngpus. The modulo must be applied
          // to the write as well as the read: without it, i == 0 stores into
          // slot `ngpus`, which belongs to the next channel (or lies past the
          // parsed data for the last one).
          int mirror = (ngpus-i)%ngpus;
          int tmp = graph->intra[ngpus*c+i];
          graph->intra[ngpus*c+i] = graph->intra[ngpus*c+mirror];
          graph->intra[ngpus*c+mirror] = tmp;
        }
      }
    }
    if (graph->nChannels) return ncclSuccess;
  }

  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

  // tmpGraph holds the parameters of the current attempt; `graph` receives
  // the best solution found (it is passed as saveGraph to the search).
  struct ncclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
  int bestSpeed = 0;

  // First try crossnic, then decrease speed and finally increase speedIntra.
  tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
  int maxSpeed = system->maxSpeed;
  tmpGraph.pattern = graph->pattern;

search:
  int time = NCCL_SEARCH_TIMEOUT;
  tmpGraph.nvlink = 1;
  tmpGraph.nChannels = 0;
  tmpGraph.sameChannels = 1;
  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));
#if 0
  printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
      printf("%d ", graph->intra[c*ngpus+g]);
    }
    printf("\n");
  }
#endif
  // time == -1 : the search saved a solution reaching maxSpeed.
  if (time == -1) goto done;
  // We already have a solution and we timed out so lower speed will just timeout as well
  if (time == 0 && graph->nChannels > 0) goto done;
  if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;

  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
    // First pass, we don't have a solution yet ; try to go slower.

    // Try a simpler tree
    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
      tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
      goto search;
    }
    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
      goto search;
    }
    tmpGraph.pattern = graph->pattern;

    // Allow the next link type, up to LINK_QPI.
    if (tmpGraph.type < LINK_QPI) {
      tmpGraph.type += 1;
      goto search;
    }
    tmpGraph.type = graph->type;

    if (crossNic && tmpGraph.crossNic == 0) {
      // Try again with crossNic if permitted
      tmpGraph.crossNic = crossNic;
      goto search;
    }
    tmpGraph.crossNic = graph->crossNic;

    // Try to reduce speed per channel
    tmpGraph.speedIntra = tmpGraph.speedInter -= 3;
    if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search;
  }

done:
  // We have a solution now. See if we can increase speedIntra
  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
    // Entering the second phase: restart from the saved solution.
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
  }
  if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
    // Try to increase the intra speed only but keeping nChannels the same
    tmpGraph.speedIntra += 3;
    maxSpeed = tmpGraph.speedIntra * graph->nChannels;
    if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
  }

  if (graph->nChannels == 0) {
    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
    graph->inter[0] = graph->inter[1] = 0;
    graph->speedIntra = graph->speedInter = 3;
    graph->nvlink = 0;
    graph->nChannels = 1;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Append " <type>/<id>" to `line` at `*offset` without ever writing past
// `size` bytes; once the buffer is full, further appends are dropped.
static void ncclTopoAppendNode(char* line, size_t size, size_t* offset, const char* type, int id) {
  if (*offset >= size) return;
  int written = snprintf(line + *offset, size - *offset, " %s/%d", type, id);
  if (written < 0) return;
  *offset += (size_t)written;
  // snprintf reports the untruncated length: clamp to the terminating NUL.
  if (*offset >= size) *offset = size - 1;
}

/* Log a summary of `graph` followed by one line per channel listing the nodes
 * it traverses: the entry NET node (when the system has NET nodes), every GPU
 * in order, then the exit NET node.
 *
 * Each line is built in a fixed-size buffer; a channel too long to fit is
 * truncated instead of overflowing the buffer. Always returns ncclSuccess.
 */
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
  int ngpus = system->nodes[GPU].count;
  const int hasNet = system->nodes[NET].count > 0;

  char line[1024];
  for (int c=0; c<graph->nChannels; c++) {
    int prefix = snprintf(line, sizeof(line), "%2d :", c);
    size_t offset = prefix < 0 ? 0 : (size_t)prefix;
    if (offset >= sizeof(line)) offset = sizeof(line) - 1;

    if (hasNet) {
      ncclTopoAppendNode(line, sizeof(line), &offset, topoNodeTypeStr[NET], graph->inter[2*c]);
    }
    for (int i=0; i<ngpus; i++) {
      ncclTopoAppendNode(line, sizeof(line), &offset, topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]);
    }
    if (hasNet) {
      ncclTopoAppendNode(line, sizeof(line), &offset, topoNodeTypeStr[NET], graph->inter[2*c+1]);
    }
    INFO(NCCL_GRAPH, "%s", line);
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Return in `*dev` the NET device used by channel `channelId`.
 * `dir` selects which of the two entries stored per channel in graph->inter
 * is read (index 0 or 1). Channel ids wrap around graph->nChannels, which
 * must therefore be non-zero. Always returns ncclSuccess.
 */
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
  const int channel = channelId % graph->nChannels;
  const int slot = channel*2 + dir;
  *dev = graph->inter[slot];
  return ncclSuccess;
}
|
||||
@@ -0,0 +1,678 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "graph.h"
|
||||
#include "topo.h"
|
||||
#include "comm.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "net.h"
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
// Buffer sizes, terminating NUL included, for a full PCI bus id
// ("domain:bus:device.function") and for its "domain:bus" prefix.
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))

// Printable names of the path distance classes.
const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };

// Printable names of the topology node types.
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };

// Printable names of the link types; the second entry is named after the
// GPU interconnect of the platform being built (XGMI for HIP, NVL otherwise).
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "QPI", "NET" };
#else
const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
#endif
|
||||
|
||||
/******************************************************************/
|
||||
/******************* Graph Creation Functions *********************/
|
||||
/******************************************************************/
|
||||
/* Read the integer stored in "<path>/numa_node".
 * Returns that integer, or -1 when the file cannot be opened or holds no
 * number.
 */
static int getNumaId(char *path) {
  char numaFile[PATH_MAX];
  snprintf(numaFile, PATH_MAX, "%s/numa_node", path);
  numaFile[PATH_MAX-1] = '\0';

  int numaId = -1;
  FILE *fp = fopen(numaFile, "r");
  if (fp != NULL) {
    // On end-of-file the value is reported as missing; a non-numeric content
    // simply leaves the -1 default in place.
    if (fscanf(fp, "%d", &numaId) == EOF) numaId = -1;
    fclose(fp);
  }
  return numaId;
}
|
||||
|
||||
/* Resolve the canonical sysfs path of the PCI device `busId`.
 *
 *   busId  bus id in "0000:00:00.0" form; converted to lower case in place
 *          (BUSID_SIZE bytes are processed)
 *   path   receives a string allocated by realpath(); the caller frees it
 *
 * Returns ncclSuccess, or ncclSystemError when the path cannot be resolved.
 */
static ncclResult_t getPciPath(char* busId, char** path) {
  // sysfs names are lower case.
  for (char* c = busId; c < busId + BUSID_SIZE; c++) *c = tolower(*c);

  // Template with the bus ("0000:00") and device ("0000:00:00.0") parts patched in below.
  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  char* busPart = busPath + sizeof("/sys/class/pci_bus/") - 1;
  char* devicePart = busPath + sizeof("/sys/class/pci_bus/0000:00/../../") - 1;
  memcpy(busPart, busId, BUSID_REDUCED_SIZE-1);
  memcpy(devicePart, busId, BUSID_SIZE-1);

  char* resolved = realpath(busPath, NULL);
  *path = resolved;
  if (resolved != NULL) return ncclSuccess;

  WARN("Could not find real path of %s", busPath);
  return ncclSystemError;
}
|
||||
|
||||
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
|
||||
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
|
||||
char* str = path+offset;
|
||||
// Remove trailing "/"
|
||||
if (*str == '/') str--;
|
||||
// Find next /
|
||||
while (*str != '/') str--;
|
||||
str++;
|
||||
NCCLCHECK(busIdToInt64(str, id));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Look up the GPU node whose id equals `id`.
 * `*index` receives its position in system->nodes[GPU].nodes, or -1 when no
 * GPU has that id. If several nodes share the id, the highest position is
 * reported. Always returns ncclSuccess.
 */
static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
  int found = -1;
  // Scanning from the end and stopping at the first hit yields the last match.
  for (int i = system->nodes[GPU].count - 1; i >= 0; i--) {
    if (system->nodes[GPU].nodes[i].id == id) {
      found = i;
      break;
    }
  }
  *index = found;
  return ncclSuccess;
}
|
||||
|
||||
|
||||
/* Resolve the sysfs path of the PCI device identified by the numeric `id`.
 * The id is first formatted as a bus id string, then resolved by getPciPath();
 * `*path` receives the string produced by getPciPath().
 * Returns ncclSuccess or the first error reported by either step.
 */
static ncclResult_t getPath(int64_t id, char** path) {
  // Placeholder with the right length; overwritten by int64ToBusId().
  char busIdStr[] = "0000:00:00.0";
  NCCLCHECK(int64ToBusId(id, busIdStr));
  NCCLCHECK(getPciPath(busIdStr, path));
  return ncclSuccess;
}
|
||||
|
||||
/* Resolve the sysfs path of the GPU with device index `cudaDev`.
 * The PCI bus id is queried from the runtime, then resolved by getPciPath();
 * `*path` receives the string produced by getPciPath().
 * Returns ncclSuccess or the first error reported by either step.
 */
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
  char deviceBusId[BUSID_SIZE];
  CUDACHECK(hipDeviceGetPCIBusId(deviceBusId, BUSID_SIZE, cudaDev));
  NCCLCHECK(getPciPath(deviceBusId, path));
  return ncclSuccess;
}
|
||||
|
||||
|
||||
int interCpuWidth = 0;
|
||||
int cpuPciWidth = 0;
|
||||
|
||||
static ncclResult_t getCpuWidths() {
|
||||
// Check if already detected
|
||||
if (interCpuWidth + cpuPciWidth) return ncclSuccess;
|
||||
|
||||
// Defaults
|
||||
char cpu[256];
|
||||
sprintf(cpu, "Generic");
|
||||
cpuPciWidth = interCpuWidth = PCI_WIDTH;
|
||||
|
||||
#ifdef __PPC__
|
||||
sprintf(cpu, "ppc64");
|
||||
interCpuWidth = P9_WIDTH;
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
sprintf(cpu, "x86_64");
|
||||
union {
|
||||
struct {
|
||||
// CPUID 0 String register order
|
||||
uint32_t ebx;
|
||||
uint32_t edx;
|
||||
uint32_t ecx;
|
||||
};
|
||||
char vendor[12];
|
||||
} cpuid0;
|
||||
|
||||
asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0));
|
||||
if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
|
||||
|
||||
if (strcmp(cpu, "Intel") == 0) {
|
||||
union {
|
||||
struct {
|
||||
int steppingId:4;
|
||||
int model:4;
|
||||
int familyId:4;
|
||||
int processorType:2;
|
||||
int resv0:2;
|
||||
int extModelId:4;
|
||||
int modelId:8;
|
||||
int resv1:4;
|
||||
};
|
||||
uint32_t val;
|
||||
} cpuid1;
|
||||
asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1));
|
||||
if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake
|
||||
sprintf(cpu, "Intel/Skylake (or later)");
|
||||
interCpuWidth = SKL_QPI_WIDTH;
|
||||
} else {
|
||||
interCpuWidth = QPI_WIDTH;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Store the CPU-to-CPU link width in `*width`, running the one-time CPU
// detection first. Returns ncclSuccess or the error from getCpuWidths().
static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
  NCCLCHECK(getCpuWidths());
  *width = interCpuWidth;
  return ncclSuccess;
}
|
||||
// Store the CPU-to-PCI link width in `*width`, running the one-time CPU
// detection first. Returns ncclSuccess or the error from getCpuWidths().
static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
  NCCLCHECK(getCpuWidths());
  *width = cpuPciWidth;
  return ncclSuccess;
}
|
||||
// Store the PCI link width (the PCI_WIDTH constant) in `*width`.
// Always returns ncclSuccess.
static ncclResult_t ncclTopoGetPciWidth(int* width) {
  *width = PCI_WIDTH;
  return ncclSuccess;
}
|
||||
// Store the network link width (the NET_WIDTH constant) in `*width`.
// Always returns ncclSuccess.
static ncclResult_t ncclTopoGetNetWidth(int* width) {
  *width = NET_WIDTH;
  return ncclSuccess;
}
|
||||
|
||||
// Kind of device found at the remote end of an NVLink, as classified from
// its PCI class code by ncclDeviceType().
enum ncclNvLinkDeviceType {
  ncclNvLinkDeviceUnknown = 0,  // PCI class not recognised
  ncclNvLinkDeviceGpu     = 1,  // another GPU
  ncclNvLinkDeviceSwitch  = 2,  // NVSwitch
  ncclNvLinkDeviceBridge  = 3,  // IBM/Power NVLink bridge (Device 04ea)
};
|
||||
|
||||
/* Classify the PCI device `busId` from the class code published in sysfs.
 *
 *   busId  lower-case bus id in "0000:00:00.0" form
 *   type   receives the classification; written only on success
 *
 * Returns ncclSuccess, or ncclSystemError when the class file cannot be
 * resolved or opened. That failure is expected inside a VM that does not
 * expose the whole machine, so it is only traced. Errors from read/close are
 * reported through SYSCHECKVAL/SYSCHECK.
 */
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
  char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
  char* rPath = realpath(classPath, NULL);
  // realpath() returns NULL when the device does not exist: do not hand a
  // NULL path to open().
  int fd = (rPath == NULL) ? -1 : open(rPath, O_RDONLY);
  if (fd == -1) {
    // Could not find device. It might be because we're in a VM and
    // we don't see the whole machine. This is handled silently so
    // we don't want to print an INFO error.
    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath ? rPath : classPath, strerror(errno));
    free(rPath);  // realpath() allocated it; release it on this path too
    return ncclSystemError;
  }
  free(rPath);

  // Pre-filled so that a short read still leaves a well-formed string.
  char pciClass[9];
  strncpy(pciClass, "0x000000", 9);
  int len;
  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
  SYSCHECK(close(fd), "close");

  if (strcmp(pciClass, "0x068000") == 0) {
    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
    *type = ncclNvLinkDeviceSwitch;
  } else if (strcmp(pciClass, "0x068001") == 0) {
    // PCI device is of type "Bridge: IBM Device 04ea"
    *type = ncclNvLinkDeviceBridge;
  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
    *type = ncclNvLinkDeviceGpu;
  } else {
    *type = ncclNvLinkDeviceUnknown;
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Connect `node` to the CPU node whose id is `numaId`, in both directions,
 * with links of type `linkType` and width `linkWidth`. The CPU node is
 * created when no existing CPU node carries that id.
 * Returns ncclSuccess or any error propagated through NCCLCHECK.
 */
ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
  struct ncclTopoNode* cpu = NULL;
  // Scanning from the end and stopping at the first hit selects the last
  // CPU node carrying this id.
  for (int c = system->nodes[CPU].count - 1; c >= 0; c--) {
    if (system->nodes[CPU].nodes[c].id == numaId) {
      cpu = system->nodes[CPU].nodes + c;
      break;
    }
  }
  if (cpu == NULL) {
    // First device seen for this id: create its CPU node.
    NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
  }
  NCCLCHECK(ncclTopoConnectNodes(node, cpu, linkType, linkWidth));
  NCCLCHECK(ncclTopoConnectNodes(cpu, node, linkType, linkWidth));
  return ncclSuccess;
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define VEGA_XGMI_WIDTH 20
|
||||
|
||||
/* Add a LINK_NVL link of width VEGA_XGMI_WIDTH from every GPU to every other
 * GPU that the runtime reports as reachable over XGMI in exactly one hop,
 * then set system->maxSpeed and system->maxWidth.
 *
 *   num_gpus  not used by this function
 *   system    topology whose GPU nodes are connected
 *
 * Pairs for which the link query fails are skipped.
 * Returns ncclSuccess or any error propagated through NCCLCHECK.
 */
ncclResult_t ncclTopoConnectXGMI(int num_gpus, struct ncclTopoSystem* system) {
  int minNvlinks = 2, minWidth = VEGA_XGMI_WIDTH;
  for (int g1=0; g1<system->nodes[GPU].count; g1++) {
    for (int g2=0; g2<system->nodes[GPU].count; g2++) {
      if (g1 == g2) continue;
      struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
      struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
      uint32_t link_type, hops;
      if (hipExtGetLinkTypeAndHopCount(gpu1->rank, gpu2->rank, &link_type, &hops) == hipSuccess) {
        if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) {
          // Each ordered pair is visited, so both directions get their link.
          NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu2, LINK_NVL, minWidth));
        }
      }
    }
  }
  int pciWidth;
  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
  system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
  system->maxWidth = minNvlinks ? minWidth : pciWidth;
  return ncclSuccess;
}
|
||||
#else
|
||||
/* Discover the NVLinks of every GPU and add the matching links to `system`.
 *
 *   nvmlDevs  NVML handles, indexed like system->nodes[GPU].nodes
 *   system    topology to complete
 *
 * For each usable, active link the remote device is classified: a GPU known
 * to the system gets a GPU-to-GPU link, an IBM bridge connects the GPU to its
 * closest CPU, and anything else (including a remote device that cannot be
 * found in sysfs) is treated as an NVSwitch, for which a single NVS node is
 * created on demand. Finally system->maxSpeed and system->maxWidth are
 * derived from the smallest link count and width seen across GPUs, falling
 * back to the PCI width when some GPU has no NVLink.
 *
 * Returns ncclSuccess or any error propagated through NCCLCHECK.
 */
ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
  struct ncclTopoNode* nvsNode = NULL;

  int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    int cudaMajor, cudaMinor;
    NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
    // Link count and per-link width depend on the compute capability.
    int maxNvLinks, width;
    if (cudaMajor < 6) {
      maxNvLinks = 0;
      width = 0;
    } else if (cudaMajor == 6) {
      maxNvLinks = 4;
      width = PASCAL_NVLINK_WIDTH;
    } else {
      maxNvLinks = 6;
      width = VOLTA_NVLINK_WIDTH;
    }

    int nvlinks = 0;
    for (int l=0; l<maxNvLinks; ++l) {
      // Check whether we can use this NVLink for P2P
      unsigned canP2P;
      if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;

      // Make sure the Nvlink is up. The previous call should have trained the link.
      nvmlEnableState_t isActive;
      if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;

      // Try to figure out what's on the other side of the NVLink
      nvmlPciInfo_t remoteProc;
      if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;

      // Make a lower case copy of the bus ID for calling ncclDeviceType
      // PCI system path is in lower case
      char* p = remoteProc.busId;
      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
        lowerId[c] = tolower(p[c]);
        if (p[c] == 0) break;
      }

      // ncclDeviceType() fails when the remote device is not visible in
      // sysfs (e.g. inside a VM). That case must not abort topology
      // construction: it is handled below as an unknown device.
      enum ncclNvLinkDeviceType type = ncclNvLinkDeviceUnknown;
      if (ncclDeviceType(lowerId, &type) != ncclSuccess) type = ncclNvLinkDeviceUnknown;

      if (type == ncclNvLinkDeviceGpu) {
        int64_t remoteId;
        NCCLCHECK(busIdToInt64(lowerId, &remoteId));
        int peer;
        NCCLCHECK(idToIndex(system, remoteId, &peer));
        // Remote GPUs that are not part of the system are ignored.
        if (peer != -1) {
          NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
          nvlinks++;
        }
      } else if (type == ncclNvLinkDeviceBridge) {
        // Nvlink between GPU and CPU (PPC)
        // Since the remote bridge does not have a valid numa_node, assume we
        // are connected to the closest CPU.
        char* path;
        NCCLCHECK(getPath(gpu->id, &path));
        int numaId = getNumaId(path);
        free(path);
        NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
        nvlinks++;
      } else { // Nvswitch
        if (type == ncclNvLinkDeviceUnknown) {
          // The NVLink is up but we couldn't find the PCI device on the other
          // side. Assume it's an NVswitch outside a VM.
          if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
        }
        if (nvsNode == NULL) { // Create nvswitch
          NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
        }
        NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
        NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
        nvlinks++;
      }
    }
    minNvlinks = std::min(minNvlinks, nvlinks);
    minWidth = std::min(minWidth, width);
  }
  int pciWidth;
  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
  system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
  system->maxWidth = minNvlinks ? minWidth : pciWidth;
  return ncclSuccess;
}
|
||||
#endif
|
||||
|
||||
/* Attach `endNode` to the topology by walking its sysfs `path` from the
 * device up towards the root.
 *
 * Walking backwards through the path, every second '/' encountered yields a
 * PCI id. If a PCI node with that id already exists, the chain built so far
 * is linked to it and the function returns, since the rest of the path is
 * already connected. Otherwise a new PCI node is created and linked in. When
 * the walk reaches the root part of the path (everything up to the fourth
 * '/'), the last node is connected to the CPU node selected by the numa id
 * read from `path`.
 *
 * Returns ncclSuccess or any error propagated through NCCLCHECK.
 */
ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
  struct ncclTopoNode* lastNode = endNode;
  int pciWidth;
  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));

  const int pathLen = strlen(path);

  // Locate the fourth '/': the walk below never goes past it.
  int offsetRC = 0;
  for (int slashes = 0; offsetRC < pathLen; offsetRC++) {
    if (path[offsetRC] == '/') slashes++;
    if (slashes == 4) break;
  }

  // Find intermediate PCI switches, from the device towards the root.
  int slashCount = 0;
  for (int offset = pathLen - 1; offset > offsetRC; offset--) {
    if (path[offset] != '/') continue;
    slashCount++;
    if ((slashCount%2) != 0) continue;

    int64_t pciId;
    NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));

    // Find if already existing
    for (int p=0; p<system->nodes[PCI].count; p++) {
      struct ncclTopoNode* known = system->nodes[PCI].nodes+p;
      if (known->id != pciId) continue;
      // Found our PCI switch. Attach and stop since the rest should already
      // be connected
      NCCLCHECK(ncclTopoConnectNodes(known, lastNode, LINK_PCI, pciWidth));
      NCCLCHECK(ncclTopoConnectNodes(lastNode, known, LINK_PCI, pciWidth));
      return ncclSuccess;
    }

    struct ncclTopoNode* pciNode;
    NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
    NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
    NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
    lastNode = pciNode;
  }

  // Then attach to a CPU node
  const int numaId = getNumaId(path);
  int cpuWidth;
  NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&cpuWidth));
  NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, cpuWidth));
  return ncclSuccess;
}
|
||||
|
||||
// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
|
||||
#include <glob.h>
|
||||
#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
|
||||
uint64_t getIbGuid(char* path) {
|
||||
uint64_t guid = 0ULL;
|
||||
char guidPath[PATH_MAX];
|
||||
snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
|
||||
// PATH has a wildcard in it so use glob()
|
||||
glob_t globbuf;
|
||||
glob(guidPath, 0, NULL, &globbuf);
|
||||
if (globbuf.gl_pathc > 0)
|
||||
strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
|
||||
globfree(&globbuf);
|
||||
guidPath[PATH_MAX-1] = '\0';
|
||||
FILE *file = fopen(guidPath, "r");
|
||||
if (file != NULL) {
|
||||
uint64_t a, b, c, d;
|
||||
if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) {
|
||||
guid = (a << 48) + (b << 32) + (c<<16) + d;
|
||||
TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
|
||||
}
|
||||
fclose(file);
|
||||
}
|
||||
return guid;
|
||||
}
|
||||
|
||||
// Per network device information used while attaching NICs to the topology.
struct netInfo {
  char* path;     // PCI path of the device, or NULL when none could be obtained
  int64_t nic;    // NIC id ; devices with the same id share one NIC node
  uint64_t asic;  // IB sys_image_guid when available, device index otherwise
  int port;       // Number of lower-indexed devices having the same nic id
  int net;        // Network id, shared by devices with the same asic and port
};
|
||||
|
||||
// Fill the nic/asic/port/net fields of each of the `ndev` entries of netInfos.
//
// By default every device is its own NIC, ASIC and network (all set to the
// device index) on port 0. When the device has a PCI path and an IB GUID can be
// read for it :
//  - asic becomes the GUID ;
//  - nic is computed from the PCI path with its last character forced to '0',
//    so that the ports of a multi-port card get the same id ;
//  - port counts the lower-indexed devices with the same nic id ;
//  - net is taken from a lower-indexed device having the same asic and port.
// The `path` field must be set (possibly to NULL) by the caller.
// Returns ncclSystemError if the path cannot be duplicated, or the error
// returned by pciPathToInt64.
ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
  for (int n=0; n<ndev; n++) {
    struct netInfo* info = netInfos+n;
    uint64_t ibGuid;
    info->nic = n;
    info->asic = n;
    info->port = 0;
    info->net = n;
    if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
      info->asic = ibGuid;

      // Ignore PCI subdevice when computing the ID to merge multi-port cards
      // and make them use the same PCI link.
      char* path = strdup(info->path);
      if (path == NULL) {
        WARN("Failed to duplicate PCI path %s", info->path);
        return ncclSystemError;
      }
      size_t pathLen = strlen(path);
      if (pathLen > 0) path[pathLen-1]='0';
      // Release the copy before checking the result so it cannot leak.
      ncclResult_t convStatus = pciPathToInt64(path, pathLen, 0, &info->nic);
      free(path);
      if (convStatus != ncclSuccess) return convStatus;

      // Same PCI path -> different ports of the same NIC
      for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;

      // Same GUID -> same network links as the other NIC
      for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
    }
    // nic and asic are 64-bit values : print them with a matching conversion,
    // and never hand a NULL pointer to %s.
    INFO(NCCL_GRAPH, "%s -> %lx/%lx/%d/%d", info->path ? info->path : "(null)",
        (unsigned long)info->nic, (unsigned long)info->asic, info->port, info->net);
  }
  return ncclSuccess;
}
|
||||
|
||||
// Create the NIC and NET nodes for the netDevCount devices described by
// netInfos and link them together with LINK_NET links of width netWidth.
// A NIC node is created once per distinct nic id ; it is attached to the PCI
// tree through its PCI path, or directly to CPU 0 when it has no path.
// This helper never frees netInfos nor the path strings : the caller does.
static ncclResult_t ncclTopoAttachNics(struct ncclTopoSystem* system, struct netInfo* netInfos, int netDevCount, int netWidth) {
  NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));

  for (int n=0; n<netDevCount; n++) {
    struct netInfo* info = netInfos+n;
    // Create NIC and attach it to the PCI tree
    struct ncclTopoNode* nicNode = NULL;
    for (int i=0; i<system->nodes[NIC].count; i++) {
      if (system->nodes[NIC].nodes[i].id == info->nic) {
        nicNode = system->nodes[NIC].nodes+i;
        break;
      }
    }
    if (!nicNode) {
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
      if (info->path) {
        // Create the PCI path
        NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
      } else {
        // This is probably a virtual NIC. Just attach it directly to CPU 0
        int width;
        NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
        NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
      }
    }

    // Create the network side
    struct ncclTopoNode* netNode;
    NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));

    // Use rank to store the net information
    netNode->rank = info->net;

    NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
    NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
  }
  return ncclSuccess;
}

// Build the PCI part of the topology :
//  1. attach every GPU node to the PCI tree using its PCI path ;
//  2. create the NIC/NET nodes for all network devices and attach them ;
//  3. connect every CPU node to every other CPU node with LINK_QPI links.
// Temporary path strings and the netInfo array are released on both the
// success and the error paths.
ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    char* path;
    NCCLCHECK(getPath(gpu->id, &path));
    // Free the path before propagating a failure so that it does not leak.
    ncclResult_t gpuStatus = ncclTopoCreatePciPath(system, gpu, path);
    free(path);
    if (gpuStatus != ncclSuccess) return gpuStatus;
  }

  // Connect the NICs
  int netDevCount;
  NCCLCHECK(ncclNetDevices(&netDevCount));
  int netWidth;
  NCCLCHECK(ncclTopoGetNetWidth(&netWidth));

  struct netInfo* netInfos;
  NCCLCHECK(ncclCalloc(&netInfos, netDevCount));

  for (int n=0; n<netDevCount; n++) {
    // A device without a PCI path is not an error : it is handled as a
    // virtual NIC later on.
    ncclResult_t pathStatus = ncclNetPciPath(n, &netInfos[n].path);
    if (pathStatus != ncclSuccess) netInfos[n].path = NULL;
  }

  ncclResult_t nicStatus = ncclTopoAttachNics(system, netInfos, netDevCount, netWidth);
  for (int n=0; n<netDevCount; n++) free(netInfos[n].path);
  free(netInfos);
  if (nicStatus != ncclSuccess) return nicStatus;

  // And connect all CPU nodes together
  for (int n=0; n<system->nodes[CPU].count; n++) {
    for (int p=0; p<system->nodes[CPU].count; p++) {
      if (n == p) continue;
      int width;
      NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
// Print `node` and, recursively, everything reachable from it through PCI
// links, one INFO line per node.
//
// `line` is a caller-provided scratch buffer ; the description of `node` is
// written at `line+offset` and the whole buffer is logged. The first `offset`
// characters are then blanked so that the children are printed indented.
// LINK_LOC links and the link going back to `prevNode` are skipped. Nodes
// behind non-PCI links are printed but not descended into.
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
  char* cursor = line+offset;
  if (node->type != GPU) {
    sprintf(cursor, "%s/%lX", topoNodeTypeStr[node->type], node->id);
  } else {
    sprintf(cursor, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
  }
  INFO(NCCL_GRAPH, "%s", line);

  // Replace what was printed before this node by spaces.
  if (offset > 0) memset(line, ' ', offset);

  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_LOC || link->remNode == prevNode) continue;

    sprintf(cursor, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
    int nextOffset = strlen(line);

    if (link->type == LINK_PCI) {
      NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
      continue;
    }

    // Not a PCI link : print the remote node on the same line and stop there.
    struct ncclTopoNode* remote = link->remNode;
    if (remote->type == NET) {
      sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[remote->type], remote->id, remote->rank);
    } else {
      sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[remote->type], remote->id);
    }
    INFO(NCCL_GRAPH, "%s", line);
  }
  return ncclSuccess;
}
|
||||
|
||||
// Log the whole topology : a header with maxWidth/maxSpeed, then the tree
// hanging off each CPU node, then the paths (ncclTopoPrintPaths).
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
  INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);

  // Scratch buffer shared by all recursion levels of ncclTopoPrintRec.
  char line[1024];
  for (int cpu = 0; cpu < s->nodes[CPU].count; cpu++) {
    struct ncclTopoNode* root = s->nodes[CPU].nodes + cpu;
    NCCLCHECK(ncclTopoPrintRec(root, NULL, line, 0));
  }

  INFO(NCCL_GRAPH, "==========================================");
  NCCLCHECK(ncclTopoPrintPaths(s));
  return ncclSuccess;
}
|
||||
|
||||
// Reorder the links of `node` so that the link leading back to `upNode` comes
// last, then do the same recursively for every node reached through a PCI
// link (other than upNode). With upNode == NULL the links of `node` itself are
// left untouched.
// Returns ncclInternalError if `node` has no link to a non-NULL `upNode`.
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
  // Shift all links to have upLink as last link
  if (upNode) {
    // Both loops are bounded by nlinks so that a missing up link, or a node
    // whose link array is full, cannot make us read past the array.
    int l=0;
    while (l < node->nlinks && node->links[l].remNode != upNode) l++;
    if (l == node->nlinks) {
      WARN("Error : could not find the up link while sorting the topology");
      return ncclInternalError;
    }
    struct ncclTopoLink upLink = node->links[l];
    while (l+1 < node->nlinks) {
      node->links[l] = node->links[l+1];
      l++;
    }
    node->links[l] = upLink;
  }

  // Recursively sort the PCI tree
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
  }
  return ncclSuccess;
}
|
||||
|
||||
// We want the graph to be organized to ease/accelerate traversal :
|
||||
// 1. NVLinks (already the case)
|
||||
// 2. PCI down
|
||||
// 3. PCI up
|
||||
// 4. QPI (already the case)
|
||||
// Sort the links of the tree hanging off each CPU node (see ncclTopoSort).
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
  for (int cpu = 0; cpu < system->nodes[CPU].count; cpu++) {
    struct ncclTopoNode* root = system->nodes[CPU].nodes + cpu;
    NCCLCHECK(ncclTopoSort(root, NULL));
  }
  return ncclSuccess;
}
|
||||
|
||||
// Allocate and build the topology of the local node and return it in *system.
//
// A GPU node is created for every rank whose hostHash equals ours and whose
// device can be looked up through NVML by bus id ; its rank field is set to
// the communicator rank. GPU-to-GPU links are then added (XGMI on HIP builds,
// NVLink otherwise), followed by the PCI/NIC/CPU connections, and finally the
// links are sorted.
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclTopoSystem* topo;
  NCCLCHECK(ncclCalloc(&topo, 1));

  nvmlDevice_t* nvmlDevs;
  int localGpus = 0;
  NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));

  for (int r = 0; r < comm->nRanks; r++) {
    // Only ranks running on our host are part of the local topology.
    if (comm->peerInfo[r].hostHash != comm->peerInfo[comm->rank].hostHash) continue;

    // Consider the GPU as outside of our node if we can't see it through NVML.
    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
    NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
    if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+localGpus) != ncclSuccess) continue;
    localGpus++;

    struct ncclTopoNode* gpuNode;
    NCCLCHECK(ncclTopoCreateNode(topo, &gpuNode, GPU, comm->peerInfo[r].busId));
    gpuNode->rank = r;
  }

#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  NCCLCHECK(ncclTopoConnectXGMI(localGpus, topo));
#else
  NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, topo));
#endif
  NCCLCHECK(ncclTopoConnectPCI(topo));

  free(nvmlDevs);
  NCCLCHECK(ncclTopoSortSystem(topo));
  *system = topo;
  return ncclSuccess;
}
|
||||
|
||||
// Set *nvlink to 1 when the pre-computed path from GPU busId1 to GPU busId2 is
// of type LINK_NVL, to 0 otherwise (including when either index is -1).
ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
  int from, to;
  NCCLCHECK(idToIndex(system, busId1, &from));
  NCCLCHECK(idToIndex(system, busId2, &to));

  if (from == -1 || to == -1) {
    *nvlink = 0;
  } else {
    *nvlink = (system->nodes[GPU].nodes[from].paths[GPU][to].type == LINK_NVL) ? 1 : 0;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Set *nvlink to 1 when GPU busId has a pre-computed path of type LINK_NVL to
// at least one other GPU of the system, to 0 otherwise.
ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
  int g;
  NCCLCHECK(idToIndex(system, busId, &g));
  *nvlink = 0;
  // ncclTopoGetNvlink treats an index of -1 as "no such GPU" ; apply the same
  // guard here instead of indexing the node array with -1.
  if (g == -1) return ncclSuccess;
  for (int i=0; i<system->nodes[GPU].count; i++) {
    if (i == g) continue;
    if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
      *nvlink = 1;
      return ncclSuccess;
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
static int pathDistance(struct ncclTopoLinkList* links) {
|
||||
int distance = PATH_PIX;
|
||||
if (links->count > 2) distance = PATH_PXB;
|
||||
for (int l=0; l<links->count; l++) {
|
||||
// PHB if we go through 1 CPU, SYS if we go through 2 CPUs
|
||||
if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB;
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
// Store in *distance the pathDistance() of the pre-computed path from GPU
// busId1 to GPU busId2.
ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
  int from, to;
  NCCLCHECK(idToIndex(system, busId1, &from));
  NCCLCHECK(idToIndex(system, busId2, &to));

  struct ncclTopoLinkList* gpuPaths = system->nodes[GPU].nodes[from].paths[GPU];
  *distance = pathDistance(gpuPaths + to);
  return ncclSuccess;
}
|
||||
|
||||
// Store in *distance the pathDistance() of the pre-computed path from GPU
// busId to the NET node of index netDev.
ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
  int gpu;
  NCCLCHECK(idToIndex(system, busId, &gpu));

  struct ncclTopoLinkList* netPaths = system->nodes[GPU].nodes[gpu].paths[NET];
  *distance = pathDistance(netPaths + netDev);
  return ncclSuccess;
}
|
||||
|
||||
// Store in *count the number of CPU nodes of the topology.
ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
  const int cpuNodes = system->nodes[CPU].count;
  *count = cpuNodes;
  return ncclSuccess;
}
|
||||
@@ -0,0 +1,138 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_TOPO_H_
|
||||
#define NCCL_TOPO_H_
|
||||
|
||||
#include "graph.h"
|
||||
#include "core.h"
|
||||
|
||||
#define LOC_WIDTH 5000
|
||||
#define PASCAL_NVLINK_WIDTH 18
|
||||
#define VOLTA_NVLINK_WIDTH 21
|
||||
#define PCI_WIDTH 12 // PCI Gen3 x16
|
||||
#define QPI_WIDTH 8
|
||||
#define SKL_QPI_WIDTH 12
|
||||
#define P9_WIDTH 32
|
||||
#define NET_WIDTH 12 // 100Gbit
|
||||
|
||||
// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
// The argument is parenthesized so that expressions such as INTEL_P2P(a+b)
// are scaled as a whole.
#define INTEL_P2P(speed) ((speed)*9/12)
#define INTEL_P2P_OVERHEAD(speed) ((speed)*12/9)
|
||||
|
||||
#define NCCL_TOPO_NODE_TYPES 6
|
||||
#define GPU 0
|
||||
#define PCI 1
|
||||
#define NVS 2
|
||||
#define CPU 3 // Actually NUMA domains
|
||||
#define NIC 4
|
||||
#define NET 5
|
||||
extern const char* topoNodeTypeStr[];
|
||||
|
||||
#define LINK_LOC 0
|
||||
#define LINK_NVL 1
|
||||
#define LINK_PCI 2
|
||||
#define LINK_QPI 3
|
||||
#define LINK_NET 4
|
||||
extern const char* topoLinkTypeStr[];
|
||||
|
||||
struct ncclTopoNode;
|
||||
// One directed link from a topology node to another node.
struct ncclTopoLink {
  int type;                      // One of the LINK_* values
  int width;                     // Link width ; links to the same node with the same type are summed
  struct ncclTopoNode* remNode;  // Node at the other end ; NULL marks an unused slot
};
|
||||
#define NCCL_TOPO_MAX_LINKS 32
|
||||
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
|
||||
#define SELECT_PATH 1
|
||||
#define SELECT_LAST 2
|
||||
|
||||
#define NET_GDR_MASK 0x70000000
|
||||
|
||||
struct ncclTopoLinkList {
|
||||
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
|
||||
int count;
|
||||
int width;
|
||||
int type;
|
||||
};
|
||||
|
||||
struct ncclTopoNode {
|
||||
int type;
|
||||
int64_t id;
|
||||
int rank;
|
||||
int nlinks;
|
||||
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
|
||||
// Pre-computed paths to GPUs and NICs
|
||||
struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
|
||||
// Used during search
|
||||
uint64_t used;
|
||||
};
|
||||
|
||||
struct ncclTopoNodeSet {
|
||||
int count;
|
||||
struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
|
||||
};
|
||||
|
||||
struct ncclTopoSystem {
|
||||
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
|
||||
int maxSpeed;
|
||||
int maxWidth;
|
||||
int searchInitDone;
|
||||
};
|
||||
|
||||
// Return in *node the node of the given type and id, creating it when it does
// not exist yet.
// New GPU nodes get an initial LINK_LOC link to themselves of width LOC_WIDTH.
// Returns ncclInternalError when the node set of that type is already full.
static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  struct ncclTopoNodeSet* set = system->nodes + type;

  // Reuse an existing node when there is one.
  for (int i = 0; i < set->count; i++) {
    struct ncclTopoNode* existing = set->nodes + i;
    if (existing->id == id) {
      *node = existing;
      return ncclSuccess;
    }
  }

  if (set->count == NCCL_TOPO_MAX_NODES) {
    WARN("Error : tried to create too many nodes of type %d\n", type);
    return ncclInternalError;
  }

  // Take the next free slot.
  struct ncclTopoNode* created = set->nodes + set->count;
  set->count++;
  created->type = type;
  created->id = id;
  if (type == GPU) {
    // Create link to itself (used in some corner cases)
    struct ncclTopoLink* self = created->links;
    created->nlinks = 1;
    self->type = LINK_LOC;
    self->remNode = created;
    self->width = LOC_WIDTH;
  }
  *node = created;
  return ncclSuccess;
}
|
||||
|
||||
// Add a link of the given type and width from `node` to `remNode`.
// If `node` already has a link of that type to `remNode`, the width is added
// to the existing link instead of creating a new one. The links of `node` are
// kept sorted by decreasing width.
// Returns ncclInternalError when a new link is needed but all the
// NCCL_TOPO_MAX_LINKS slots of `node` are in use.
static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
  // Aggregate links into higher width for NVLink
  struct ncclTopoLink* link;
  struct ncclTopoLink* const linksEnd = node->links + NCCL_TOPO_MAX_LINKS;
  for (link = node->links; link != linksEnd && link->remNode; link++) {
    if (link->remNode == remNode && link->type == type) break;
  }
  // No matching link and no free slot : refuse rather than write past the array.
  if (link == linksEnd) {
    WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
    return ncclInternalError;
  }
  if (link->remNode == NULL) node->nlinks++;
  link->type = type;
  link->remNode = remNode;
  link->width += width;

  // Sort links in BW descending order
  struct ncclTopoLink linkSave;
  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
  while (link != node->links) {
    if ((link-1)->width >= linkSave.width) break;
    memcpy(link, link-1, sizeof(struct ncclTopoLink));
    link--;
  }
  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
|
||||
|
||||
#endif
|
||||
@@ -4,9 +4,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "net.h"
|
||||
#include "param.h"
|
||||
#include "nccl.h"
|
||||
|
||||
#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
|
||||
|
||||
@@ -0,0 +1,213 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "comm.h"
|
||||
#include "topo.h"
|
||||
|
||||
NCCL_PARAM(Nthreads, "NTHREADS", -2);
|
||||
NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
|
||||
|
||||
static int getNthreads(const char* name, int env, int min, int max, int def) {
|
||||
int nt = env;
|
||||
if (nt > 0) {
|
||||
if (nt % WARP_SIZE != 0) {
|
||||
WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE);
|
||||
nt = max;
|
||||
} else if (nt > max) {
|
||||
WARN("Invalid %s %d (maximum %d).", name, nt, max);
|
||||
nt = max;
|
||||
} else if (nt < min) {
|
||||
WARN("Invalid %s %d (minimum %d).", name, nt, min);
|
||||
nt = min;
|
||||
}
|
||||
} else {
|
||||
nt = def;
|
||||
}
|
||||
return nt;
|
||||
}
|
||||
|
||||
// Parse a comma-separated list of names into a table of flags.
//   str    : the list ; a leading '^' inverts its meaning
//   elems  : the nelems known names (compared case-insensitively)
//   nelems : number of entries of elems and list
//   list   : output ; list[i] is 1 when elems[i] is selected, 0 otherwise
// Without '^', only the listed names are selected ; with '^', all names but
// the listed ones are selected. Unknown names are ignored.
// Returns ncclSystemError when the working copy of str cannot be allocated.
ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
  int def, set;
  if (str[0] == '^') {
    def = 1; set = 0; str++;
  } else {
    def = 0; set = 1;
  }
  for (int i=0; i<nelems; i++) list[i] = def;

  // strtok_r modifies its input : work on a private copy.
  char* tokStr = strdup(str);
  if (tokStr == NULL) {
    WARN("Failed to allocate memory to parse list %s", str);
    return ncclSystemError;
  }
  char* tmpStr;
  char* token = strtok_r(tokStr, ",", &tmpStr);
  while (token) {
    for (int i=0; i<nelems; i++)
      if (strcasecmp(token, elems[i]) == 0) list[i] = set;
    token = strtok_r(NULL, ",", &tmpStr);
  }
  free(tokStr);
  return ncclSuccess;
}
|
||||
|
||||
static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
|
||||
static const char* ncclAlgoStr[] = { "Tree", "Ring" };
|
||||
static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
|
||||
|
||||
// Latencies in us, Bandwidths in GB/s
|
||||
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
|
||||
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 } };
|
||||
|
||||
// NVLink, PCI, Network
|
||||
#define NCCL_HW_NVLINK 0
|
||||
#define NCCL_HW_PCI 1
|
||||
#define NCCL_HW_NET 2
|
||||
// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
|
||||
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
||||
{ /* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 } }
|
||||
};
|
||||
|
||||
// LL128 max BW for the different collectives
|
||||
static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
|
||||
|
||||
// Compute the tuning data of a communicator :
//  - comm->maxThreads for each protocol (NCCL_NTHREADS / NCCL_LL128_NTHREADS) ;
//  - comm->bandwidths and comm->latencies for each collective, algorithm and
//    protocol, from the tree and ring graphs and the baseLat/hwLat tables ;
//  - a bandwidth of 0 for every algorithm/protocol disabled by default or
//    through NCCL_ALGO / NCCL_PROTO ;
//  - comm->threadThresholds, optionally overridden by NCCL_THREAD_THRESHOLDS.
// minCompCap/maxCompCap are only used to decide whether LL128 is enabled by
// default. With a single rank, only maxThreads is set.
ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
  int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
  comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
  comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
  comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);

  INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);

  if (comm->nRanks <= 1) return ncclSuccess;

  struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
  int intraHw[2], hw[2];
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;

  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
    int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
      comm->nRanks;

    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      // The tree algorithm is only modeled for AllReduce.
      if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
        float busBw = graphs[a]->nChannels * speed * 1.0;

        // Various model refinements
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0;
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;

        // Convert bus BW to algorithm BW
        float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
        comm->bandwidths[coll][a][p] = busBw * ratio;

        comm->latencies[coll][a][p] = baseLat[a][p];
        if (a == NCCL_ALGO_RING) {
          float lat = hwLat[hw[a]][a][p];
          if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
            if (ringGraph->sameChannels) {
              comm->latencies[coll][a][p] += lat;
            } else {
              if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
              comm->latencies[coll][a][p] += nsteps*lat;
            }
          } else {
            comm->latencies[coll][a][p] += nsteps*lat;
          }
        } else {
          float intraLat = hwLat[intraHw[a]][a][p];
          float interLat = hwLat[NCCL_HW_NET][a][p];
          comm->latencies[coll][a][p] +=
            2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
        }
      }
    }
  }

  // Protocols/Algorithms enable/disable, and user overrides.
  // All are enabled except ll128 which is enabled by default only in certain cases.
  // The value 2 means "decide automatically" ; parseList replaces it by 0 or 1.
  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };

  const char *protoStr = getenv("NCCL_PROTO");
  if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
  const char *algoStr = getenv("NCCL_ALGO");
  if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));

  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    int pEnable = protoEnable[p];
    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
      // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
      pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
    }
    // A bandwidth of 0 marks the combination as disabled.
    if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
  }

  if (comm->rank == 0) {
    // Print the latency/bandwidth table, one line per collective.
    char line[1024];
    int offset = 0;
    sprintf(line, "Latency/AlgBw |");
    offset = strlen(line);
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
        offset = strlen(line);
      }
    }
    INFO(NCCL_TUNING, "%s", line);
    for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
      sprintf(line, "%13s |", ncclFuncStr[c]);
      offset = strlen(line);
      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
          sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
          offset = strlen(line);
        }
      }
      INFO(NCCL_TUNING, "%s", line);
    }
  }

  // Set per-thread amount of work before we increase nThreads and nChannels
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
    comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
    comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
  }
  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;

  // Override defaults with user env
  char* str = getenv("NCCL_THREAD_THRESHOLDS");
  if (str) {
    // Mark every entry as "not provided". An aggregate initializer "{ -2 }"
    // would only set the first entry and zero the others, which would then
    // overwrite the defaults with 0 when fewer than 6 values are given.
    ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        t[a][p] = -2;
      }
    }
    sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        // Negative values keep the default.
        if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
      }
    }
  }

  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
  return ncclSuccess;
}
|
||||
@@ -52,11 +52,6 @@ struct ncclAsyncArgs {
|
||||
|
||||
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
|
||||
|
||||
ncclResult_t ncclSetDevice(int cudaDev) {
|
||||
CUDACHECK(hipSetDevice(cudaDev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#define CHECK(a) do { \
|
||||
if ((args->ret = (a)) != ncclSuccess) { \
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
|
||||
@@ -66,15 +61,14 @@ ncclResult_t ncclSetDevice(int cudaDev) {
|
||||
|
||||
void* ncclAsyncThreadMain(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
CHECK(ncclSetDevice(args->init.cudaDev));
|
||||
CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank));
|
||||
CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
|
||||
return args;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) {
|
||||
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
|
||||
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
|
||||
return ncclAsyncErrCheck(ncclInternalError);
|
||||
return ncclAsyncErrCheck(ncclInvalidUsage);
|
||||
}
|
||||
int index = ncclGroupIndex++;
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+index;
|
||||
@@ -85,8 +79,6 @@ ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm
|
||||
args->init.ndev = ndev;
|
||||
memcpy(&args->init.commId, &commId, sizeof(commId));
|
||||
args->init.myrank = myrank;
|
||||
// We need to use threads for Init
|
||||
pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -98,7 +90,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
|
||||
}
|
||||
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
|
||||
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
|
||||
return ncclAsyncErrCheck(ncclInternalError);
|
||||
return ncclAsyncErrCheck(ncclInvalidUsage);
|
||||
}
|
||||
ncclGroupIndex++;
|
||||
args->funcType = ASYNC_FUNC_COLL;
|
||||
@@ -125,6 +117,14 @@ ncclResult_t ncclGroupEnd() {
|
||||
ncclResult_t ret = ncclGroupError;
|
||||
if (ret != ncclSuccess) goto group_cleanup;
|
||||
|
||||
/* Launch async ncclCommInitRank */
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
|
||||
}
|
||||
}
|
||||
|
||||
/* Collectives are done in three steps :
|
||||
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
|
||||
* 2. Barrier Wait. No CUDA call is permitted
|
||||
@@ -167,8 +167,8 @@ ncclResult_t ncclGroupEnd() {
|
||||
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
|
||||
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
|
||||
if (err == EBUSY) continue;
|
||||
if (err != 0) { ret = ncclSystemError; goto end; }
|
||||
if (args->ret != ncclSuccess) { ret = args->ret; goto end; }
|
||||
if (err != 0) ret = ncclSystemError;
|
||||
if (args->ret != ncclSuccess) ret = args->ret;
|
||||
doneArray[i] = 1;
|
||||
done--;
|
||||
}
|
||||
@@ -176,20 +176,47 @@ ncclResult_t ncclGroupEnd() {
|
||||
}
|
||||
goto end;
|
||||
group_cleanup:
|
||||
// At least one call in the group failed. Since we want to make that group
|
||||
// an atomic operation, we need to cancel all operations.
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
for (int i=0; i<channel->collCount; i++) {
|
||||
STORE(&channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active, 0);
|
||||
if (ret != ncclSuccess) {
|
||||
// At least one call in the group failed. Since we want to make that group
|
||||
// an atomic operation, we need to cancel all operations.
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
|
||||
if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm));
|
||||
*args->init.newcomm = NULL;
|
||||
} else {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
for (int i=0; i<channel->collCount; i++) {
|
||||
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
|
||||
}
|
||||
channel->collFifoTail = channel->collStart;
|
||||
channel->collCount = 0;
|
||||
}
|
||||
/* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
struct ncclProxyArgs *op, *start;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
op = start = state->ops;
|
||||
while (op) {
|
||||
if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
|
||||
struct ncclProxyArgs* peerOp = op->nextPeer;
|
||||
while (peerOp) {
|
||||
if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
|
||||
peerOp = peerOp->nextPeer;
|
||||
}
|
||||
op = op->next;
|
||||
if (op == start) break;
|
||||
}
|
||||
comm->opCount = comm->lastOpCount;
|
||||
pthread_cond_signal(&state->cond);
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
|
||||
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
|
||||
comm->userStreamSet = false;
|
||||
}
|
||||
channel->collFifoTail = channel->collStart;
|
||||
channel->collCount = 0;
|
||||
}
|
||||
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
|
||||
comm->userStreamSet = false;
|
||||
}
|
||||
end:
|
||||
ncclGroupError = ncclSuccess;
|
||||
@@ -8,6 +8,7 @@
|
||||
#define NCCL_ARGCHECK_H_
|
||||
|
||||
#include "core.h"
|
||||
#include "info.h"
|
||||
|
||||
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
|
||||
ncclResult_t ArgsCheck(struct ncclInfo* info);
|
||||
|
||||
@@ -17,4 +17,5 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapClose(void* commState);
|
||||
ncclResult_t bootstrapAbort(void* commState);
|
||||
#endif
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#ifndef NCCL_CHANNEL_H_
|
||||
#define NCCL_CHANNEL_H_
|
||||
#include "core.h"
|
||||
#include "comm.h"
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -9,7 +9,10 @@
|
||||
#ifndef NCCL_COLLECTIVES_H_
|
||||
#define NCCL_COLLECTIVES_H_
|
||||
|
||||
#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
|
||||
#include "core.h"
|
||||
#include "info.h"
|
||||
|
||||
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
|
||||
|
||||
#define NCCL_COLL_NAME(coll, op, dtype) \
|
||||
coll##_##op##_##dtype
|
||||
@@ -24,7 +27,8 @@
|
||||
|
||||
#define DECL_COLL4(coll, op, dtype) \
|
||||
DECL_COLL5(coll, op, dtype) \
|
||||
DECL_COLL5(coll##LL, op, dtype)
|
||||
DECL_COLL5(coll##LL, op, dtype) \
|
||||
DECL_COLL5(coll##LL128, op, dtype)
|
||||
|
||||
#define DECL_COLL3(coll, op, dtype) \
|
||||
DECL_COLL4(coll##Ring, op, dtype) \
|
||||
@@ -8,13 +8,33 @@
|
||||
#ifndef NCCL_COMM_H_
|
||||
#define NCCL_COMM_H_
|
||||
|
||||
#define MAXCHANNELS 16
|
||||
#include "transport.h"
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if CUDART_VERSION < 9000
|
||||
// Local definition of cudaLaunchParams. Per the surrounding preprocessor
// guards it is only compiled on the non-HIP path when CUDART_VERSION < 9000.
// NOTE(review): presumably mirrors the runtime's own cudaLaunchParams layout
// for toolkits that do not provide it — confirm against the CUDA headers.
struct cudaLaunchParams {
|
||||
void *func;
|
||||
dim3 gridDim;
|
||||
dim3 blockDim;
|
||||
void **args;
|
||||
size_t sharedMem;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
|
||||
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#define MEM_ALIGN 4096
|
||||
#define CUDA_IPC_MIN 2097152UL
|
||||
|
||||
// Channels / LL tuning
|
||||
#define NCCL_LL_THREAD_THRESHOLD 8
|
||||
#define NCCL_LL128_THREAD_THRESHOLD 8
|
||||
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
|
||||
|
||||
struct ncclSendMem {
|
||||
union {
|
||||
struct {
|
||||
@@ -40,6 +60,7 @@ struct ncclRecvMem {
|
||||
char pad4[MEM_ALIGN];
|
||||
};
|
||||
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
|
||||
uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
|
||||
char buff[1]; // Actually larger than that
|
||||
};
|
||||
|
||||
@@ -47,13 +68,18 @@ struct ncclComm {
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
|
||||
struct ncclPeerInfo* peerInfo;
|
||||
struct ncclTopoSystem* topo;
|
||||
|
||||
void* bootstrap;
|
||||
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
int cudaDev; // my cuda device index
|
||||
int nvmlDev; // my NVML device number
|
||||
int64_t busId; // my PCI bus ID in int format
|
||||
|
||||
int node;
|
||||
int nNodes;
|
||||
int localRanks;
|
||||
|
||||
enum { GROUP, PARALLEL } launchMode;
|
||||
hipStream_t userStream;
|
||||
@@ -64,17 +90,19 @@ struct ncclComm {
|
||||
// Counter to make sure collectives match (needed for bcast/reduce
|
||||
// where syncs are not symmetric).
|
||||
uint64_t opCount;
|
||||
uint64_t lastOpCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
int nThreads;
|
||||
|
||||
// Low-latency algorithm threshold
|
||||
ssize_t llThreshold;
|
||||
ssize_t threadThreshold;
|
||||
// Only nvlink is used for inter-GPU communication
|
||||
int nvlink;
|
||||
|
||||
// Tree algorithm threshold
|
||||
ssize_t treeThreshold;
|
||||
// Algorithm/Protocols thresholds
|
||||
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
int maxThreads[NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// An internal CUDA stream for NCCL kernel CGMD launches
|
||||
int groupCudaStream;
|
||||
|
||||
+22
-15
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,19 +9,11 @@
|
||||
#define NCCL_CORE_H_
|
||||
|
||||
#include <pthread.h>
|
||||
#include <algorithm>
|
||||
#include "nccl.h"
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "alloc.h"
|
||||
#include "transport.h"
|
||||
#include "devcomm.h"
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
#include "argcheck.h"
|
||||
#include <cstdio>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm> // For std::min/std::max
|
||||
#include "nccl.h"
|
||||
|
||||
#ifdef PROFAPI
|
||||
#define NCCL_API(ret, func, args...) \
|
||||
@@ -38,10 +31,6 @@
|
||||
ret func(args)
|
||||
#endif // end PROFAPI
|
||||
|
||||
int ncclCudaCompCap();
|
||||
ncclResult_t ncclNvlinkGpu(int* nvlink);
|
||||
int64_t ncclTreeThreshold();
|
||||
|
||||
static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
switch (type) {
|
||||
case ncclInt8:
|
||||
@@ -63,4 +52,22 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
}
|
||||
}
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
#define NCCL_PROTO_LL 0
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "alloc.h"
|
||||
#include "utils.h"
|
||||
#include "param.h"
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
+11
-102
@@ -7,15 +7,14 @@
|
||||
#ifndef NCCL_DEBUG_H_
|
||||
#define NCCL_DEBUG_H_
|
||||
|
||||
#include <pthread.h>
|
||||
#include "core.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <chrono>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include "nccl.h"
|
||||
#include "nccl_net.h"
|
||||
|
||||
#define gettid() (pid_t) syscall(SYS_gettid)
|
||||
@@ -25,9 +24,16 @@ extern uint64_t ncclDebugMask;
|
||||
extern pthread_mutex_t ncclDebugOutputLock;
|
||||
extern FILE *ncclDebugFile;
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
|
||||
|
||||
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
||||
|
||||
// Let code temporarily downgrade WARN into INFO
|
||||
extern thread_local int ncclDebugNoWarn;
|
||||
// Evaluate expression `a` with ncclDebugNoWarn set to 1 and store its result
// in `ret`, then set ncclDebugNoWarn back to 0.
// Note: the flag is reset to 0 unconditionally rather than restored to its
// previous value, so nested uses do not preserve an outer setting.
// Both macro arguments are parenthesized so any expression can be passed.
#define NOWARN(a, ret) do { \
  ncclDebugNoWarn = 1; \
  (ret) = (a); \
  ncclDebugNoWarn = 0; \
} while (0)
|
||||
|
||||
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
@@ -39,101 +45,4 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
#define TRACE(...)
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
static inline void initDebug() {
|
||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||
if (nccl_debug == NULL) {
|
||||
ncclDebugLevel = NCCL_LOG_NONE;
|
||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_VERSION;
|
||||
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_WARN;
|
||||
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_ABORT;
|
||||
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_TRACE;
|
||||
}
|
||||
|
||||
/* Parse the NCCL_DEBUG_SUBSYS env var
|
||||
* This can be a comma separated list such as INIT,COLL
|
||||
* or ^INIT,COLL etc
|
||||
*/
|
||||
char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
|
||||
if (nccl_debug_subsys != NULL) {
|
||||
char *subsys = strtok(nccl_debug_subsys, ",");
|
||||
while (subsys != NULL) {
|
||||
int invert = 0;
|
||||
uint64_t mask = 0;
|
||||
if (subsys[0] == '^') { invert = 1; subsys++; }
|
||||
if (strcasecmp(subsys, "INIT") == 0) {
|
||||
mask = NCCL_INIT;
|
||||
} else if (strcasecmp(subsys, "COLL") == 0) {
|
||||
mask = NCCL_COLL;
|
||||
} else if (strcasecmp(subsys, "P2P") == 0) {
|
||||
mask = NCCL_P2P;
|
||||
} else if (strcasecmp(subsys, "SHM") == 0) {
|
||||
mask = NCCL_SHM;
|
||||
} else if (strcasecmp(subsys, "NET") == 0) {
|
||||
mask = NCCL_NET;
|
||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
if (mask) {
|
||||
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
|
||||
}
|
||||
subsys = strtok(NULL, ",");
|
||||
}
|
||||
}
|
||||
|
||||
/* Parse and expand the NCCL_DEBUG_FILE path and
|
||||
* then create the debug file. But don't bother unless the
|
||||
* NCCL_DEBUG level is > VERSION
|
||||
*/
|
||||
const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
|
||||
if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
|
||||
int c = 0;
|
||||
char debug_fn[PATH_MAX+1] = "";
|
||||
char *dfn = debug_fn;
|
||||
while (nccl_debug_file[c] != '\0' && c < PATH_MAX) {
|
||||
if (nccl_debug_file[c++] != '%') {
|
||||
*dfn++ = nccl_debug_file[c-1];
|
||||
continue;
|
||||
}
|
||||
switch (nccl_debug_file[c++]) {
|
||||
case '%': // Double %
|
||||
*dfn++ = '%';
|
||||
break;
|
||||
case 'h': // %h = hostname
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024, '.');
|
||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||
break;
|
||||
case 'p': // %p = pid
|
||||
dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
|
||||
break;
|
||||
default: // Echo everything we don't understand
|
||||
*dfn++ = '%';
|
||||
*dfn++ = nccl_debug_file[c-1];
|
||||
break;
|
||||
}
|
||||
}
|
||||
*dfn = '\0';
|
||||
if (debug_fn[0] != '\0') {
|
||||
FILE *file = fopen(debug_fn, "w");
|
||||
if (file != NULL) {
|
||||
INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
|
||||
ncclDebugFile = file;
|
||||
}
|
||||
}
|
||||
}
|
||||
pthread_mutex_init(&ncclDebugOutputLock, NULL);
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
ncclEpoch = std::chrono::high_resolution_clock::now();
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -24,8 +24,6 @@
|
||||
#define NCCL_MAX_OPS 2048
|
||||
#define NCCL_STEPS 8
|
||||
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
|
||||
|
||||
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
#define ROUNDUP(x, y) \
|
||||
@@ -49,16 +47,18 @@ union ncclLLFifoLine {
|
||||
int4 i4;
|
||||
};
|
||||
|
||||
#define MAXTHREADS 256
|
||||
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
|
||||
#define NUM_LINES_PER_THREAD 8
|
||||
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
|
||||
#define WARP_SIZE 64
|
||||
#define MAXCHANNELS 32
|
||||
#define NCCL_MAX_NTHREADS 256
|
||||
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
|
||||
#define NCCL_LL_LINES_PER_THREAD 8
|
||||
#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
|
||||
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
|
||||
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
|
||||
#ifdef DEBUG_LL
|
||||
#define NCCL_LL_CLEAN_MASK 0x00000ff8
|
||||
#define NCCL_LL_FLAG_MAX 0x00001000
|
||||
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
|
||||
#ifdef TEST_LL_CLEANUP
|
||||
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
|
||||
#define NCCL_LL_FLAG_MAX 0x100
|
||||
#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
|
||||
#else
|
||||
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
|
||||
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
|
||||
@@ -66,6 +66,24 @@ union ncclLLFifoLine {
|
||||
// Make sure the clean mask will last for at least NCCL_NSTEPS
|
||||
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
|
||||
|
||||
#define NCCL_LL128_LINESIZE 64
|
||||
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
|
||||
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
|
||||
|
||||
#define NCCL_LL128_MAX_NTHREADS 256
|
||||
#define NCCL_LL128_ELEMS_PER_THREAD 120
|
||||
|
||||
// Receiving from up to 3 sources is more compute intensive than sending
// to 3 dests.
// NOTE(review): this comment used to announce a 70% (reduce) / 30% (bcast)
// split, but the macro below returns half of nt, i.e. an even split —
// confirm which one is intended.
#define NCCL_LL128_SPLIT(nt) ((nt)/2)
|
||||
|
||||
#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS)
|
||||
#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t))
|
||||
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
|
||||
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buff; // Local for recv, remote for send
|
||||
@@ -90,6 +108,9 @@ struct ncclConnInfo {
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
|
||||
// High bandwidth, low latency protocol
|
||||
uint64_t* ll128Buff; // Local for recv, remote for send
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
@@ -167,7 +188,8 @@ struct ncclChannel {
|
||||
union {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree treeUp;
|
||||
struct ncclTree treeDn;
|
||||
|
||||
int id;
|
||||
int nthreads;
|
||||
@@ -186,6 +208,7 @@ struct ncclChannel {
|
||||
int collFifoTail; // Only used by CPU
|
||||
|
||||
uint32_t* abortCount;
|
||||
uint32_t* sync;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
@@ -193,8 +216,6 @@ struct ncclChannel {
|
||||
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
|
||||
#pragma pack(pop) /* restore original alignment from stack */
|
||||
|
||||
#define MAXCHANNELS 16
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf {
|
||||
union {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,15 +7,9 @@
|
||||
#ifndef NCCL_ENQUEUE_H_
|
||||
#define NCCL_ENQUEUE_H_
|
||||
|
||||
#include "core.h"
|
||||
#include "comm.h"
|
||||
#include "group.h"
|
||||
|
||||
// Channels / LL tuning
|
||||
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
|
||||
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL
|
||||
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
|
||||
#define NCCL_THREAD_THRESHOLD_VEGA 8 // Per thread size before we switch to non-LL for VEGA
|
||||
#define NCCL_LL_MIN_NTHREADS 256
|
||||
#include "collectives.h"
|
||||
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_GRAPH_H_
|
||||
#define NCCL_GRAPH_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "devcomm.h"
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Path distance classes, numbered 0..4, plus a trailing element count.
// The values also index the pathDists[] string table declared right after
// this enum.
// NOTE(review): the names look like the usual PCI topology labels (PIX, PXB,
// PHB, NODE, SYS), with larger values meaning a more distant path — confirm.
enum ncclPathDist {
|
||||
PATH_PIX = 0,
|
||||
PATH_PXB = 1,
|
||||
PATH_PHB = 2,
|
||||
PATH_NODE = 3,
|
||||
PATH_SYS = 4,
|
||||
PATH_ARRAY_SIZE = 5 // number of distance classes above; size of pathDists[]
|
||||
};
|
||||
|
||||
extern const char* pathDists[PATH_ARRAY_SIZE];
|
||||
|
||||
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
|
||||
|
||||
struct ncclTopoSystem;
|
||||
// Build the topology
|
||||
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
|
||||
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
|
||||
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
|
||||
void ncclTopoFree(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
|
||||
ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
|
||||
ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
|
||||
ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
|
||||
ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
|
||||
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
|
||||
#define NCCL_TOPO_PATTERN_RING 4 // Ring
|
||||
// Graph description passed to ncclTopoCompute() and ncclTopoPrintGraph().
// Fields are grouped by the original comments into "input / output" and
// "output" members.
struct ncclTopoGraph {
|
||||
// Input / output
|
||||
int pattern; // presumably one of the NCCL_TOPO_PATTERN_* values defined above — confirm
|
||||
int crossNic;
|
||||
// Output
|
||||
int nChannels;
|
||||
int speedIntra;
|
||||
int speedInter;
|
||||
int type;
|
||||
int nvlink;
|
||||
int sameChannels;
|
||||
int nHops;
|
||||
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; // sized for NCCL_TOPO_MAX_NODES entries per channel
|
||||
int inter[MAXCHANNELS*2]; // sized for 2 entries per channel
|
||||
};
|
||||
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
||||
|
||||
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
||||
|
||||
// Per-channel integer tables (one slot per channel, MAXCHANNELS slots each)
// exchanged between ncclTopoPreset() and ncclTopoPostset().
// NOTE(review): judging by the names, each slot holds a rank for the ring
// (recv/send/prev/next) and for the up/down trees (recv/send) — confirm
// against the code that fills them.
struct ncclTopoRanks {
|
||||
int ringRecv[MAXCHANNELS];
|
||||
int ringSend[MAXCHANNELS];
|
||||
int ringPrev[MAXCHANNELS];
|
||||
int ringNext[MAXCHANNELS];
|
||||
int treeUpRecv[MAXCHANNELS];
|
||||
int treeUpSend[MAXCHANNELS];
|
||||
int treeDnRecv[MAXCHANNELS];
|
||||
int treeDnSend[MAXCHANNELS];
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings);
|
||||
|
||||
ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
|
||||
|
||||
#endif
|
||||
@@ -9,14 +9,14 @@
|
||||
#define NCCL_GROUP_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "core.h"
|
||||
#include "comm.h"
|
||||
|
||||
bool ncclAsyncMode();
|
||||
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
|
||||
|
||||
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
|
||||
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
|
||||
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
|
||||
|
||||
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,6 +9,7 @@
|
||||
#define NCCL_INFO_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "core.h"
|
||||
|
||||
typedef enum {
|
||||
ncclPatternRing,
|
||||
@@ -21,7 +23,7 @@ typedef enum {
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
struct ncclInfo {
|
||||
ncclColl_t coll;
|
||||
ncclFunc_t coll;
|
||||
const char* opName;
|
||||
// NCCL Coll Args
|
||||
const void* sendbuff;
|
||||
@@ -36,7 +38,11 @@ struct ncclInfo {
|
||||
int chunkSteps;
|
||||
int sliceSteps;
|
||||
// Computed later
|
||||
int algorithm;
|
||||
int protocol;
|
||||
ncclPattern_t pattern;
|
||||
int nChannels;
|
||||
int nThreads;
|
||||
size_t nBytes;
|
||||
int nstepsPerLoop;
|
||||
int nchunksPerLoop;
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
#define NCCL_PTR_CUDA 0x2
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -17,7 +18,6 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
static const char* ncclNetName() { return ncclNet->name; }
|
||||
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
|
||||
@@ -31,6 +31,37 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
|
||||
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
#define GPU_BUF_SIZE (2*1024*1024)
|
||||
// Report the pointer types supported by network device `dev`.
// NCCL_PTR_CUDA is only reported when the plugin advertises it AND a GPU
// buffer could actually be registered and deregistered on both a send and a
// receive communicator of a loopback connection. A failed probe is not an
// error: the function still returns ncclSuccess, just without NCCL_PTR_CUDA.
static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) {
  int advertised;
  NCCLCHECK(ncclNet->ptrSupport(dev, &advertised));

  // Start from everything the plugin advertises except GPU memory.
  *supportedTypes = advertised & ~NCCL_PTR_CUDA;
  if ((advertised & NCCL_PTR_CUDA) == 0) return ncclSuccess;

  // The network supports GPU Direct RDMA ; verify the GPU supports it as well.
  void* listenComm = NULL;
  void* sendComm = NULL;
  void* recvComm = NULL;
  void* devBuf = NULL;
  void* regHandle = NULL;
  ncclNetHandle_t handle;
  ncclResult_t rc;

  NCCLCHECKGOTO(ncclNetListen(dev, &handle, &listenComm), rc, release);
  NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sendComm), rc, release);
  NCCLCHECKGOTO(ncclNetAccept(listenComm, &recvComm), rc, release);
  CUDACHECKGOTO(hipMalloc(&devBuf, GPU_BUF_SIZE), rc, release);

  // The first registration is expected to fail on unsupported setups, so it
  // is attempted with warnings suppressed.
  NOWARN(ncclNetRegMr(sendComm, devBuf, GPU_BUF_SIZE, NCCL_PTR_CUDA, &regHandle), rc);
  if (rc != ncclSuccess) goto release;
  NCCLCHECKGOTO(ncclNetDeregMr(sendComm, regHandle), rc, release);
  NCCLCHECKGOTO(ncclNetRegMr(recvComm, devBuf, GPU_BUF_SIZE, NCCL_PTR_CUDA, &regHandle), rc, release);
  NCCLCHECKGOTO(ncclNetDeregMr(recvComm, regHandle), rc, release);

  // Every step succeeded: GPU memory can be used with this device.
  *supportedTypes |= NCCL_PTR_CUDA;

release:
  if (devBuf) hipFree(devBuf);
  if (recvComm) ncclNetCloseRecv(recvComm);
  if (sendComm) ncclNetCloseSend(sendComm);
  if (listenComm) ncclNetCloseListen(listenComm);
  return ncclSuccess;
}
|
||||
|
||||
extern ncclNet_t ncclNetIb;
|
||||
extern ncclNet_t ncclNetSocket;
|
||||
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NVLINK_H_
|
||||
#define NCCL_NVLINK_H_
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include "nvmlwrap.h"
|
||||
#include "topo.h"
|
||||
|
||||
#define CONNECT_NVLINK 0x10
|
||||
#define CONNECT_NVSWITCH 0x100
|
||||
|
||||
enum ncclNvLinkDeviceType {
|
||||
ncclNvLinkDeviceGpu,
|
||||
ncclNvLinkDeviceSwitch,
|
||||
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
|
||||
};
|
||||
|
||||
// Classify the PCI device identified by `busId` by reading its sysfs "class"
// file.
//   busId : PCI bus id; its first 12 characters ("0000:00:00.0" form) are
//           patched into the sysfs path.
//   type  : set to Switch / Bridge / Gpu on success.
// Returns ncclSuccess when the class is recognized, ncclSystemError when the
// device cannot be resolved or opened, or when its class is unknown.
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
  char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);

  // realpath() returns NULL on failure; never hand that to open() or "%s".
  char* rPath = realpath(classPath, NULL);
  if (rPath == NULL) {
    // Could not find device. It might be because we're in a VM and
    // we don't see the whole machine. This is handled silently so
    // we don't want to print an INFO error.
    TRACE(NCCL_INIT, "Open of %s failed : %s\n", classPath, strerror(errno));
    return ncclSystemError;
  }

  int fd = open(rPath, O_RDONLY);
  if (fd == -1) {
    // Same silent handling as above; report first (errno is still valid),
    // then release the path allocated by realpath().
    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
    free(rPath);
    return ncclSystemError;
  }
  free(rPath);

  // Pre-filled so the buffer is a valid, terminated string even on a short read.
  char pciClass[9] = "0x000000";
  int len;
  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
  SYSCHECK(close(fd), "close");

  if (strcmp(pciClass, "0x068000") == 0) {
    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
    *type = ncclNvLinkDeviceSwitch;
  } else if (strcmp(pciClass, "0x068001") == 0) {
    // PCI device is of type "Bridge: IBM Device 04ea"
    *type = ncclNvLinkDeviceBridge;
  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
    *type = ncclNvLinkDeviceGpu;
  } else {
    // Ignore if we don't know what's on the other side.
    return ncclSystemError;
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Get the maximum number of NVLinks based on the GPU generation */
|
||||
// Store in *maxLinks the NVLink count assumed for the current device:
// 6 when its compute capability major version is above 6, otherwise 4.
// Any failing device query is propagated by CUDACHECK.
static ncclResult_t getMaxNvlinks(int* maxLinks) {
  int device;
  CUDACHECK(hipGetDevice(&device));

  int major;
  CUDACHECK(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, device));

  // 6 for Volta, 4 for Pascal
  if (major > 6) {
    *maxLinks = 6;
  } else {
    *maxLinks = 4;
  }
  return ncclSuccess;
}
|
||||
|
||||
static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
// Determine if that connection is through NVLink
|
||||
int links = 0;
|
||||
int nvswitch_links = 0;
|
||||
int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
|
||||
nvmlDevice_t nvmlDev;
|
||||
ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
|
||||
if (res != ncclSuccess) return 0;
|
||||
|
||||
for(int l=0; l<maxNvLinks; ++l) {
|
||||
// Check whether we can use this NVLink for P2P
|
||||
unsigned canP2P;
|
||||
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
|
||||
|
||||
// Make sure the Nvlink is up. The previous call should have trained the link.
|
||||
nvmlEnableState_t isActive;
|
||||
if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
|
||||
|
||||
// Try to figure out what's on the other side of the NVLink
|
||||
nvmlPciInfo_t remoteProc;
|
||||
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
|
||||
|
||||
// Old versions of NVML return a lowercase PCI ID
|
||||
char* p = remoteProc.busId;
|
||||
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
|
||||
if (p[c] == 0) break;
|
||||
p[c] = toupper(p[c]);
|
||||
}
|
||||
|
||||
if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
|
||||
links++;
|
||||
} else {
|
||||
// Make a lower case copy of the bus ID for calling ncclDeviceType
|
||||
// PCI system path is in lower case
|
||||
char* p = remoteProc.busId;
|
||||
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
|
||||
if (p[c] == 0) break;
|
||||
lowerId[c] = tolower(p[c]);
|
||||
}
|
||||
|
||||
// Determine if the remote side is NVswitch or a GPU
|
||||
enum ncclNvLinkDeviceType type;
|
||||
ncclResult_t ret = ncclDeviceType(lowerId, &type);
|
||||
if (ret == ncclSuccess) {
|
||||
if (type == ncclNvLinkDeviceSwitch) {
|
||||
//TODO: we are making an assumption that all GPUs are connected to this switch
|
||||
//This assumption may change for future architectures
|
||||
nvswitch_links++;
|
||||
} else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
|
||||
links++;
|
||||
}
|
||||
} else {
|
||||
// The NVLink is up but we couldn't find the PCI device on the other
|
||||
// side. Assume it's an NVswitch outside a VM.
|
||||
if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
|
||||
nvswitch_links++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,30 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NVLINK_H_
|
||||
#define NCCL_NVLINK_H_
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include "nvmlwrap.h"
|
||||
#include "topo.h"
|
||||
|
||||
#define CONNECT_NVLINK 0x10
|
||||
#define CONNECT_NVSWITCH 0x100
|
||||
|
||||
/* Kind of device found at the remote end of an NVLink. */
enum ncclNvLinkDeviceType {
  ncclNvLinkDeviceGpu    = 0,
  ncclNvLinkDeviceSwitch = 1,
  ncclNvLinkDeviceBridge = 2, // IBM/Power NVLink bridge (Device 04ea)
};
|
||||
|
||||
static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
int links = 0;
|
||||
return CONNECT_NVLINK*links;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -9,18 +9,31 @@
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
//#define NVML_DIRECT 1
|
||||
#ifdef NVML_DIRECT
|
||||
#include "nvml.h"
|
||||
// The NVML library doesn't appear to be thread safe
|
||||
#include <pthread.h>
|
||||
extern pthread_mutex_t nvmlLock;
|
||||
#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
|
||||
#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
|
||||
|
||||
// Evaluate `cmd` between NVMLLOCK() and NVMLUNLOCK() (i.e. while holding
// nvmlLock) and store its value in `ret`. `ret` must be an assignable lvalue
// in the caller's scope.
#define NVMLLOCKCALL(cmd, ret) do { \
|
||||
NVMLLOCK(); \
|
||||
ret = cmd; \
|
||||
NVMLUNLOCK(); \
|
||||
} while(false)
|
||||
|
||||
#define NVMLCHECK(cmd) do { \
|
||||
nvmlReturn_t e = cmd; \
|
||||
nvmlReturn_t e; \
|
||||
NVMLLOCKCALL(cmd, e); \
|
||||
if( e != NVML_SUCCESS ) { \
|
||||
WARN("NVML failure '%s'", nvmlErrorString(e)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
//#define NVML_DIRECT 1
|
||||
#ifdef NVML_DIRECT
|
||||
#include "nvml.h"
|
||||
|
||||
// NVML_DIRECT build: there are no symbols to resolve, so this always succeeds.
static ncclResult_t wrapNvmlSymbols(void) {
  return ncclSuccess;
}
|
||||
// Calls nvmlInit() through NVMLCHECK, which returns from this function early
// on failure; otherwise reports ncclSuccess.
static ncclResult_t wrapNvmlInit(void) {
  NVMLCHECK(nvmlInit());
  return ncclSuccess;
}
|
||||
// Calls nvmlShutdown() through NVMLCHECK, which returns from this function
// early on failure; otherwise reports ncclSuccess.
static ncclResult_t wrapNvmlShutdown(void) {
  NVMLCHECK(nvmlShutdown());
  return ncclSuccess;
}
|
||||
@@ -57,6 +70,10 @@ static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned i
|
||||
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
|
||||
return ncclSuccess;
|
||||
}
|
||||
// NVML_DIRECT build: forwards to nvmlDeviceGetCudaComputeCapability() through
// NVMLCHECK (early return on failure) and reports ncclSuccess otherwise.
// `major` and `minor` are passed straight through to NVML.
static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
  NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));

  return ncclSuccess;
}
|
||||
#else
|
||||
// Dynamically handle dependencies on NVML
|
||||
|
||||
@@ -139,6 +156,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
|
||||
nvmlNvLinkCapability_t capability, unsigned int *capResult);
|
||||
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
|
||||
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
|
||||
|
||||
#endif // NVML_DIRECT
|
||||
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_RINGS_H_
|
||||
#define NCCL_RINGS_H_
|
||||
|
||||
/*
 * Default thread count: 256, except on non-HIP builds when
 * ncclCudaCompCap() reports 3, where it is 128.
 */
static int getDefaultThreads() {
  int nthreads = 256;
#if !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
  // On Kepler, rings are doubled later.
  if (ncclCudaCompCap() == 3) nthreads = 128;
#endif
  return nthreads;
}
|
||||
|
||||
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
|
||||
|
||||
#endif
|
||||
@@ -40,14 +40,14 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
|
||||
CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError_t);
|
||||
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
|
||||
CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError);
|
||||
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError);
|
||||
|
||||
*shmPtr = ptr;
|
||||
return ncclSuccess;
|
||||
sysError:
|
||||
WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
|
||||
hipError_t:
|
||||
hipError:
|
||||
if (fd != -1) close(fd);
|
||||
if (create) shm_unlink(shmname);
|
||||
if (ptr != MAP_FAILED) munmap(ptr, shmsize);
|
||||
|
||||
@@ -66,7 +66,9 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
|
||||
#endif
|
||||
struct netIf userIfs[MAX_IFS];
|
||||
bool searchNot = prefixList && prefixList[0] == '^';
|
||||
if (searchNot) prefixList++;
|
||||
bool searchExact = prefixList && prefixList[0] == '=';
|
||||
if (searchExact) prefixList++;
|
||||
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
|
||||
|
||||
int found = 0;
|
||||
@@ -118,17 +120,17 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
|
||||
return found;
|
||||
}
|
||||
|
||||
static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
|
||||
static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
|
||||
/* Check family first */
|
||||
int family = local_if.ifa_addr->sa_family;
|
||||
if (family != remote.sa.sa_family) {
|
||||
if (family != remote->sa.sa_family) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (family == AF_INET) {
|
||||
struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
|
||||
struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
|
||||
struct sockaddr_in& remote_addr = remote.sin;
|
||||
struct sockaddr_in& remote_addr = remote->sin;
|
||||
struct in_addr local_subnet, remote_subnet;
|
||||
local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
|
||||
remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
|
||||
@@ -136,7 +138,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
|
||||
} else if (family == AF_INET6) {
|
||||
struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
|
||||
struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
|
||||
struct sockaddr_in6& remote_addr = remote.sin6;
|
||||
struct sockaddr_in6& remote_addr = remote->sin6;
|
||||
struct in6_addr& local_in6 = local_addr->sin6_addr;
|
||||
struct in6_addr& mask_in6 = mask->sin6_addr;
|
||||
struct in6_addr& remote_in6 = remote_addr.sin6_addr;
|
||||
@@ -161,7 +163,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
|
||||
}
|
||||
}
|
||||
|
||||
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
|
||||
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[1024];
|
||||
#endif
|
||||
@@ -189,13 +191,13 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
|
||||
// Store the interface name
|
||||
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
|
||||
|
||||
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
|
||||
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr->sa), line_a));
|
||||
found++;
|
||||
if (found == maxIfs) break;
|
||||
}
|
||||
|
||||
if (found == 0) {
|
||||
WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
|
||||
WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr->sa), line_a));
|
||||
}
|
||||
freeifaddrs(interfaces);
|
||||
return found;
|
||||
@@ -300,7 +302,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
|
||||
// Try to find interface that is in the same subnet as the IP in comm id
|
||||
union socketAddress idAddr;
|
||||
GetSocketAddrFromString(&idAddr, commId);
|
||||
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
|
||||
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
|
||||
}
|
||||
}
|
||||
// Then look for anything else (but not docker or lo)
|
||||
@@ -387,7 +389,7 @@ retry:
|
||||
if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
|
||||
if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
|
||||
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
|
||||
if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_TOPO_H_
|
||||
#define NCCL_TOPO_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
ncclResult_t getCudaPath(int cudaDev, char** path);
|
||||
|
||||
/*
 * Read the integer stored in "<path>/numa_node".
 *
 * Returns the parsed value, or -1 when the file cannot be opened or nothing
 * could be read from it.
 */
static int getNumaId(char *path) {
  char nodeFile[PATH_MAX];
  snprintf(nodeFile, PATH_MAX, "%s/numa_node", path);
  nodeFile[PATH_MAX-1] = '\0';

  FILE *fp = fopen(nodeFile, "r");
  if (fp == NULL) return -1;

  // numaId keeps its -1 default if fscanf fails to convert anything.
  int numaId = -1;
  const int rc = fscanf(fp, "%d", &numaId);
  fclose(fp);

  return (rc == EOF) ? -1 : numaId;
}
|
||||
|
||||
/*
 * PCI path distance classes, numbered consecutively from 0.
 * PATH_ARRAY_SIZE is not a distance itself: it is the number of classes and
 * is used to size the pathDists[] name table.
 */
enum ncclPathDist {
  PATH_PIX = 0,
  PATH_PXB,
  PATH_PHB,
  PATH_NODE,
  PATH_SYS,
  PATH_ARRAY_SIZE
};
|
||||
|
||||
extern const char* pathDists[PATH_ARRAY_SIZE];
|
||||
|
||||
int pciDistance(char* path1, char* path2);
|
||||
|
||||
#endif
|
||||
@@ -7,12 +7,15 @@
|
||||
#ifndef NCCL_TRANSPORT_H_
|
||||
#define NCCL_TRANSPORT_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "devcomm.h"
|
||||
#include <stdint.h>
|
||||
#include "graph.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "core.h"
|
||||
|
||||
#define NTRANSPORTS 3
|
||||
#define TRANSPORT_P2P 0
|
||||
#define TRANSPORT_SHM 1
|
||||
#define TRANSPORT_NET 2
|
||||
|
||||
extern struct ncclTransport ncclTransports[];
|
||||
|
||||
@@ -24,15 +27,13 @@ struct ncclComm;
|
||||
struct ncclPeerInfo {
|
||||
int rank;
|
||||
int cudaDev;
|
||||
int nvmlDev;
|
||||
int gdrSupport;
|
||||
uint64_t hostHash;
|
||||
uint64_t pidHash;
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
dev_t shmDev;
|
||||
int64_t busId;
|
||||
};
|
||||
|
||||
// Used to hold the transport connection values
|
||||
typedef int64_t ncclTvalue_t;
|
||||
|
||||
#define CONNECT_SIZE 128
|
||||
struct ncclConnect {
|
||||
char data[CONNECT_SIZE];
|
||||
@@ -51,7 +52,7 @@ struct ncclProxyArgs {
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int llMode;
|
||||
int protocol;
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
|
||||
// Internal state
|
||||
@@ -78,7 +79,7 @@ struct ncclProxyState {
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
|
||||
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
|
||||
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
|
||||
ncclResult_t (*free)(void*);
|
||||
ncclResult_t (*proxy)(struct ncclProxyArgs*);
|
||||
@@ -86,8 +87,7 @@ struct ncclTransportComm {
|
||||
|
||||
struct ncclTransport {
|
||||
const char name[4];
|
||||
ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
|
||||
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
|
||||
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
|
||||
struct ncclTransportComm send;
|
||||
struct ncclTransportComm recv;
|
||||
};
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,6 +10,14 @@
|
||||
#include "nccl.h"
|
||||
#include <stdint.h>
|
||||
|
||||
int ncclCudaCompCap();
|
||||
|
||||
// PCI Bus ID <-> int64 conversion functions
|
||||
ncclResult_t int64ToBusId(int64_t id, char* busId);
|
||||
ncclResult_t busIdToInt64(char* busId, int64_t* id);
|
||||
|
||||
ncclResult_t getBusId(int cudaDev, int64_t *busId);
|
||||
|
||||
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
uint64_t getHash(const char* string, int n);
|
||||
uint64_t getHostHash();
|
||||
@@ -24,4 +31,10 @@ struct netIf {
|
||||
int parseStringList(const char* string, struct netIf* ifList, int maxList);
|
||||
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
|
||||
|
||||
/*
 * Integer base-2 logarithm: returns floor(log2(n)) for n >= 1.
 *
 * Returns 0 for n <= 0. Without the guard a negative n would never reach 0
 * under an arithmetic right shift and the loop would not terminate.
 */
static long log2i(long n) {
  long l = 0;
  if (n <= 0) return 0;
  while (n>>=1) l++;
  return l;
}
|
||||
|
||||
#endif
|
||||
|
||||
+212
-588
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,10 +1,12 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "argcheck.h"
|
||||
#include "comm.h"
|
||||
|
||||
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
|
||||
hipPointerAttribute_t attr;
|
||||
|
||||
@@ -16,6 +16,7 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
|
||||
static nvmlReturn_t (*nvmlInternalShutdown)(void);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
|
||||
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
|
||||
@@ -23,7 +24,10 @@ static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t dev
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
|
||||
nvmlNvLinkCapability_t capability, unsigned int *capResult);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
|
||||
|
||||
// Used to make the NVML library calls thread safe
|
||||
pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
ncclResult_t wrapNvmlSymbols(void) {
|
||||
if (nvmlState == nvmlInitialized)
|
||||
@@ -70,12 +74,14 @@ ncclResult_t wrapNvmlSymbols(void) {
|
||||
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
|
||||
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
|
||||
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
|
||||
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
|
||||
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);
|
||||
|
||||
nvmlState = nvmlInitialized;
|
||||
return ncclSuccess;
|
||||
@@ -85,6 +91,7 @@ teardown:
|
||||
nvmlInternalShutdown = NULL;
|
||||
nvmlInternalDeviceGetHandleByPciBusId = NULL;
|
||||
nvmlInternalDeviceGetIndex = NULL;
|
||||
nvmlInternalDeviceGetHandleByIndex = NULL;
|
||||
nvmlInternalDeviceGetPciInfo = NULL;
|
||||
nvmlInternalDeviceGetMinorNumber = NULL;
|
||||
nvmlInternalDeviceGetNvLinkState = NULL;
|
||||
@@ -130,7 +137,8 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_
|
||||
WARN("lib wrapper not initialized.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
|
||||
nvmlInternalErrorString(ret));
|
||||
@@ -144,7 +152,8 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
|
||||
WARN("lib wrapper not initialized.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
WARN("nvmlDeviceGetIndex() failed: %s ",
|
||||
nvmlInternalErrorString(ret));
|
||||
@@ -153,12 +162,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/*
 * Look up an NVML device handle by index through the dynamically loaded
 * nvmlDeviceGetHandleByIndex symbol, calling it under the NVML lock.
 *
 * Returns ncclInternalError if the symbol has not been loaded,
 * ncclSystemError if NVML reports a failure, ncclSuccess otherwise.
 */
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
  if (!nvmlInternalDeviceGetHandleByIndex) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }

  nvmlReturn_t ret;
  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
  if (ret == NVML_SUCCESS) return ncclSuccess;

  WARN("nvmlDeviceGetHandleByIndex() failed: %s ", nvmlInternalErrorString(ret));
  return ncclSystemError;
}
|
||||
|
||||
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
|
||||
if (nvmlInternalDeviceGetPciInfo == NULL) {
|
||||
WARN("lib wrapper not initialized.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
WARN("nvmlDeviceGetPciInfo() failed: %s ",
|
||||
nvmlInternalErrorString(ret));
|
||||
@@ -172,7 +197,8 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min
|
||||
WARN("lib wrapper not initialized.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
WARN("nvmlDeviceGetMinorNumber() failed: %s ",
|
||||
nvmlInternalErrorString(ret));
|
||||
@@ -186,7 +212,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
|
||||
/* Do not warn, this symbol is optional. */
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
if (ret != NVML_ERROR_NOT_SUPPORTED)
|
||||
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
|
||||
@@ -201,7 +228,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
|
||||
/* Do not warn, this symbol is optional. */
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
if (ret != NVML_ERROR_NOT_SUPPORTED)
|
||||
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
|
||||
@@ -217,7 +245,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
|
||||
/* Do not warn, this symbol is optional. */
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
if (ret != NVML_ERROR_NOT_SUPPORTED)
|
||||
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
|
||||
@@ -226,4 +255,19 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/*
 * Query the CUDA compute capability of `device` through the dynamically
 * loaded nvmlDeviceGetCudaComputeCapability symbol, calling it under the
 * NVML lock. `major` and `minor` are passed straight through to NVML.
 *
 * Returns ncclInternalError if the symbol has not been loaded,
 * ncclSystemError if NVML reports a failure, ncclSuccess otherwise.
 */
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
  // Guard the pointer that is actually called below. The NvLinkCapability
  // pointer is loaded as an optional symbol and says nothing about this one.
  if (nvmlInternalDeviceGetCudaComputeCapability == NULL) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }
  nvmlReturn_t ret;
  NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret);
  if (ret != NVML_SUCCESS) {
    WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
        nvmlInternalErrorString(ret));
    return ncclSystemError;
  }
  return ncclSuccess;
}
|
||||
#endif
|
||||
|
||||
@@ -24,6 +24,7 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_
|
||||
}
|
||||
|
||||
// Stub: ignores `device`, reports index 0 and always succeeds.
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
  *index = 0u;
  return ncclSuccess;
}
|
||||
|
||||
@@ -32,6 +33,7 @@ ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
|
||||
}
|
||||
|
||||
// Stub: ignores `device`, reports minor number 0 and always succeeds.
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
  *minorNumber = 0u;
  return ncclSuccess;
}
|
||||
|
||||
@@ -45,5 +47,11 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
|
||||
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
|
||||
nvmlNvLinkCapability_t capability, unsigned int *capResult) {
|
||||
*capResult = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Stub: ignores `device`, reports compute capability 1.1 and always succeeds.
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
  *minor = 1;
  *major = 1;
  return ncclSuccess;
}
|
||||
|
||||
@@ -1,398 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "param.h"
|
||||
|
||||
#define NCCL_MAX_SCORE 7
|
||||
|
||||
/* Parse user defined rings. Format is like :
|
||||
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
|
||||
* Rings with a non-matching number of ranks are ignored so we can provide
|
||||
* rings for multiple cases.
|
||||
*/
|
||||
#define MAX_ENV_RANKS 512
|
||||
static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
|
||||
int ranks[MAX_ENV_RANKS];
|
||||
int nrings = 0;
|
||||
int rank = 0;
|
||||
int offset = 0;
|
||||
int status = 0; // 0 : between numbers, 1 : inside number
|
||||
do {
|
||||
int digit = str[offset] - '0';
|
||||
if (digit >= 0 && digit <= 9) {
|
||||
if (status == 0) {
|
||||
ranks[rank] = digit;
|
||||
status = 1;
|
||||
} else {
|
||||
ranks[rank] = ranks[rank]*10+digit;
|
||||
}
|
||||
} else {
|
||||
if (status == 1) {
|
||||
rank++;
|
||||
if (rank == MAX_ENV_RANKS) goto end;
|
||||
}
|
||||
status = 0;
|
||||
if (str[offset] == '|' || str[offset] == '\0') {
|
||||
int prevRank = ranks[rank-1];
|
||||
// Ignore rings if nranks doesn't match
|
||||
if (rank != nranks) goto newring;
|
||||
|
||||
for (int r=0; r<nranks; r++) {
|
||||
int rank = ranks[r];
|
||||
// Ignore rings with ranks out of bounds
|
||||
if (rank < 0 || rank >= nranks) goto newring;
|
||||
// Ignore rings with duplicate ranks
|
||||
for (int i=0; i<r; i++)
|
||||
if (ranks[i] == rank) goto newring;
|
||||
|
||||
next[nrings*nranks+prevRank] = rank;
|
||||
prev[nrings*nranks+rank] = prevRank;
|
||||
prevRank = rank;
|
||||
}
|
||||
nrings++;
|
||||
newring:
|
||||
rank = 0;
|
||||
}
|
||||
}
|
||||
} while (str[offset++] != 0);
|
||||
end:
|
||||
*nringsRet = nrings;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ring creation algorithm
|
||||
*
|
||||
* First, we establish hierarchical coordinates depending on the way ranks can
|
||||
* communicate. After fillCoords, we have for each rank a unique 3-int array
|
||||
* { node, pci_domain, rank } corresponding to the three transports :
|
||||
* { 2[NET], 1[SHM], 0[P2P] }.
|
||||
* Also, we renumber ranks (to indexes) based on their growing coordinates.
|
||||
*
|
||||
* Then, we ask transports to connect groups together. We start with net, then
|
||||
* shm, then p2p. We maintain two arrays, prev and next, where values are equal
|
||||
* to -1 when ranks are not yet connected, and a rank otherwise. We never
|
||||
* connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
|
||||
* ranks, if we are rank 13, we should see something like (provided we have a
|
||||
* single net interface, hence a single ring) :
|
||||
*
|
||||
* Connecting all nodes <13>
|
||||
* 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
|
||||
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
|
||||
*
|
||||
* Connecting P2P domains with shared memory <13>
|
||||
* 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
|
||||
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
|
||||
*
|
||||
* Connecting ranks (only inside the P2P domain) <13>
|
||||
* 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
|
||||
* next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
|
||||
*
|
||||
* Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net
|
||||
* which always sees the full world). That way, P2P can brute-force all combinations inside the node without
|
||||
* risking a combinatorial explosion, and we scale better.
|
||||
*
|
||||
* Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
|
||||
* we get at least one ring.
|
||||
*/
|
||||
|
||||
/*
 * Depth-first walk over `matrix` (nranks x nranks, row-major): marks `rank`
 * in `connected`, then recurses into every not-yet-marked peer whose entry in
 * rank's row equals `transport`.
 */
static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
  int* row = matrix + rank*nranks;
  int peer = 0;

  connected[rank] = 1;
  while (peer < nranks) {
    if (!connected[peer] && row[peer] == transport) {
      recIsConnected(peer, connected, nranks, matrix, transport);
    }
    peer++;
  }
}
|
||||
|
||||
/*
 * Fill connected[0..nranks) with 1 for every rank reachable from `rank`
 * through `transport` entries of `matrix` (including `rank` itself), and 0
 * for all others.
 */
static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
  // Clear the whole map before the recursive walk marks reachable ranks.
  int i = nranks;
  while (i-- > 0) connected[i] = 0;

  recIsConnected(rank, connected, nranks, matrix, transport);
}
|
||||
|
||||
// Assign the next index to `rank`: record the rank<->idx mapping in
// rankToIdx/idxToRank, copy the NTRANSPORTS entries of current[] into
// coords for that rank, then increment idx. Relies on rankToIdx, idxToRank,
// coords, current and idx being in scope where the macro is expanded.
#define NEW_IDX(rank) do { \
|
||||
rankToIdx[rank] = idx; \
|
||||
idxToRank[idx] = rank; \
|
||||
for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
|
||||
idx++; \
|
||||
} while (0)
|
||||
|
||||
int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
|
||||
for (int r=0; r<nranks; r++) {
|
||||
if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Walks all ranks in topological order, giving each one an index
// (rankToIdx / idxToRank) and a coordinate per transport level (coords).
// Returns ncclInternalError when some rank cannot be reached through any
// transport from the last rank placed.
static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
  int current[NTRANSPORTS];
  int* inP2pDomain;
  NCCLCHECK(ncclCalloc(&inP2pDomain, nranks));
  for (int level=0; level<NTRANSPORTS; level++) current[level] = 0;

  int anchor = 0, idx = 0;
  for (;;) {
    // Transport 0 (P2P) has no level below it and may connect GPUs
    // indirectly, so the whole P2P domain of `anchor` is detected and
    // added in one go.
    isConnected(anchor, inP2pDomain, nranks, matrix, 0);
    for (int r=0; r<nranks; r++) {
      if (!inP2pDomain[r]) continue;
      NEW_IDX(r);
      anchor = r;
      current[0]++;
    }
    current[0] = 0;

    if (idx == nranks) break;  // every rank has been placed

    // Look for the next group, trying transport 1 first and moving to
    // higher transports only when nothing unplaced is reachable.
    int level = 1;
    int nextRank = findConnected(anchor, matrix, nranks, level, coords);
    while (nextRank == -1) {
      current[level] = 0;
      level++;
      if (level == NTRANSPORTS) {
        WARN("Error : Could not find transport to connect next group\n");
        free(inP2pDomain);
        return ncclInternalError;
      }
      nextRank = findConnected(anchor, matrix, nranks, level, coords);
    }
    anchor = nextRank;
    current[level]++;
  }

  free(inP2pDomain);
  return ncclSuccess;
}
|
||||
|
||||
// Build-time default passed to the MIN_NRINGS parameter below:
// 2 on HIP builds, 4 on PPC builds, 0 otherwise.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define DEFAULT_MIN_NRINGS 2
|
||||
#elif defined(__PPC__)
|
||||
// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
|
||||
#define DEFAULT_MIN_NRINGS 4
|
||||
#else
|
||||
#define DEFAULT_MIN_NRINGS 0
|
||||
#endif
|
||||
// NOTE(review): NCCL_PARAM is defined elsewhere; presumably it generates the
// ncclParamMinNrings()/ncclParamMaxNrings() accessors called by ncclGetRings,
// backed by the NCCL_MIN_NRINGS / NCCL_MAX_NRINGS environment variables - confirm.
NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS);
|
||||
NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
|
||||
|
||||
/* Users can force the number of threads with an environment variable */
|
||||
// -2 is the "not set" sentinel that getEnvThreads tests for.
NCCL_PARAM(Nthreads, "NTHREADS", -2);
|
||||
// Overrides *nthreads with the Nthreads parameter when it holds anything
// other than the -2 "unset" sentinel; otherwise *nthreads is left untouched.
// Always returns ncclSuccess.
ncclResult_t getEnvThreads(int* nthreads) {
  const int64_t forced = ncclParamNthreads();
  if (forced == -2) return ncclSuccess;
  *nthreads = forced;
  return ncclSuccess;
}
|
||||
|
||||
static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
|
||||
if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
|
||||
for (int r=nrings; r<newNrings; r++) {
|
||||
for (int i=0; i<nranks; i++) {
|
||||
a[r*nranks+i] = a[(r-nrings)*nranks+i];
|
||||
b[r*nranks+i] = b[(r-nrings)*nranks+i];
|
||||
c[r*nranks+i] = c[(r-nrings)*nranks+i];
|
||||
d[r*nranks+i] = d[(r-nrings)*nranks+i];
|
||||
}
|
||||
}
|
||||
return newNrings;
|
||||
}
|
||||
/* Main ring creation function */
|
||||
// Computes the rings for this rank: fills prev/next and treeIn/treeOut
// (nranks entries per ring), sets *nrings and may update *nthreads.
//  - nranks == 1: returns immediately with *nrings == 0.
//  - NCCL_RINGS set and accepted by parseRings: those rings are used as-is.
//  - otherwise: rings are searched transport by transport with a decreasing
//    minScore, then duplicated / limited according to the MinNrings and
//    MaxNrings parameters.
// NOTE(review): if NCCLCHECK returns on error (as its use here suggests), the
// temporaries allocated below are not freed on those paths - confirm.
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
|
||||
*nrings = 0;
|
||||
|
||||
if (nranks == 1) return ncclSuccess;
|
||||
|
||||
// User-supplied rings take precedence over the topology search.
char* str = getenv("NCCL_RINGS");
|
||||
if (str && strlen(str)>0) {
|
||||
int ret = parseRings(str, nrings, nranks, prev, next);
|
||||
if (ret == ncclSuccess && *nrings > 0) {
|
||||
if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
|
||||
NCCLCHECK(getEnvThreads(nthreads));
|
||||
for (int r = 0; r<*nrings; r++) {
|
||||
for (int i = 0; i<nranks; i++) {
|
||||
// NOTE(review): transport value 2 is presumably the network transport - confirm against ncclTransports.
if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
|
||||
if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
|
||||
*nrings = 0;
|
||||
}
|
||||
|
||||
// Compute hierarchical topology groups, indexes, and rank<->index tables
|
||||
int* coords, *globalIdxToRank, *globalRankToIdx;
|
||||
NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
|
||||
// -1 marks "no coordinates yet"; findConnected relies on this.
for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
|
||||
NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
|
||||
NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
|
||||
|
||||
NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
|
||||
|
||||
// Start with a high score, then decrease until we find rings
|
||||
int minScore = NCCL_MAX_SCORE;
|
||||
int nringsTmp;
|
||||
int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
|
||||
NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&idxToRank, nranks));
|
||||
NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
|
||||
NCCLCHECK(ncclCalloc(&groups, nranks));
|
||||
NCCLCHECK(ncclCalloc(&subgroups, nranks));
|
||||
|
||||
int nThreads;
|
||||
// Each pass restarts from empty rings; the loop repeats with a lower
// minScore while no ring was found and minScore has not reached 0.
do {
|
||||
nThreads = *nthreads;
|
||||
for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
|
||||
nringsTmp = MAXCHANNELS;
|
||||
// Loop over transports to connect groups
|
||||
for (int t=NTRANSPORTS-1; t>=0; t--) {
|
||||
for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
|
||||
|
||||
int nidx = 0;
|
||||
for (int i=0; i<nranks; i++) {
|
||||
// Extract only ranks in the same local area as rank
|
||||
// We need to extract them in the topological order, hence we iterate over indexes, not ranks
|
||||
int r = globalIdxToRank[i];
|
||||
int sameLocal = 1;
|
||||
for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
|
||||
if (!sameLocal) continue;
|
||||
|
||||
groups[nidx] = coords[r*NTRANSPORTS+t];
|
||||
// At level 0 there is no lower level, so each rank is its own subgroup.
subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
|
||||
rankToIdx[r] = nidx;
|
||||
idxToRank[nidx] = r;
|
||||
nidx++;
|
||||
}
|
||||
|
||||
int ngroups = groups[nidx-1] + 1; // Coords should be ordered
|
||||
|
||||
ncclTvalue_t* subvalues;
|
||||
int *subprev, *subnext;
|
||||
NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
|
||||
NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
|
||||
NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
|
||||
// With a single group there is nothing to connect at this transport level.
if (ngroups > 1) {
|
||||
/* Extract subvalues */
|
||||
for (int i=0; i<nidx; i++) {
|
||||
for (int j=0; j<nidx; j++) {
|
||||
if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
|
||||
subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
|
||||
else
|
||||
subvalues[i*nidx+j] = 0;
|
||||
}
|
||||
}
|
||||
/* Extract subprev/subnext */
|
||||
for (int i=0; i<nidx*nringsTmp; i++) {
|
||||
subprev[i] = subnext[i] = -1;
|
||||
}
|
||||
for (int r=0; r<nringsTmp; r++) {
|
||||
int start = -1, end = -1;
|
||||
for (int i=0; i<nranks; i++) {
|
||||
if (rankToIdx[i] == -1) continue;
|
||||
if (prevTmp[r*nranks+i] != -1) start = i;
|
||||
if (nextTmp[r*nranks+i] != -1) end = i;
|
||||
}
|
||||
if (start != -1 && end != -1) {
|
||||
subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
|
||||
subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
|
||||
}
|
||||
}
|
||||
/* Get rings */
|
||||
NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
|
||||
/* Merge subprev/subnext into prev/next */
|
||||
for (int r=0; r<nringsTmp; r++) {
|
||||
for (int i=0; i<nidx; i++) {
|
||||
if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
|
||||
if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
|
||||
if (t == NTRANSPORTS-1) {
|
||||
// Save node-level masters for trees
|
||||
treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
|
||||
treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
//for (int r=0; r<nringsTmp; r++) {
|
||||
//printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
|
||||
//printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
|
||||
//}
|
||||
}
|
||||
free(subvalues);
|
||||
free(subprev);
|
||||
free(subnext);
|
||||
if (nringsTmp == 0) break;
|
||||
}
|
||||
minScore--;
|
||||
// Keep this pass's result only if it produced more rings than recorded so far.
if (nringsTmp > *nrings) {
|
||||
*nrings = nringsTmp;
|
||||
for (int i=0; i<nranks*(*nrings); i++) {
|
||||
prev[i] = prevTmp[i];
|
||||
next[i] = nextTmp[i];
|
||||
}
|
||||
}
|
||||
} while (nringsTmp == 0 && minScore);
|
||||
|
||||
free(coords);
|
||||
free(globalRankToIdx);
|
||||
free(globalIdxToRank);
|
||||
free(prevTmp);
|
||||
free(nextTmp);
|
||||
free(idxToRank);
|
||||
free(rankToIdx);
|
||||
free(groups);
|
||||
free(subgroups);
|
||||
|
||||
*nthreads = nThreads;
|
||||
|
||||
/* Duplicate the rings in case of multinode+NVLink */
|
||||
int nnodes = 0;
|
||||
// Counts the treeIn flags of the first ring only (indices 0..nranks-1).
for (int r=0; r<nranks; r++) nnodes += treeIn[r];
|
||||
int nvlink;
|
||||
NCCLCHECK(ncclNvlinkGpu(&nvlink));
|
||||
if (nnodes > 1 && nvlink) {
|
||||
*nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
|
||||
}
|
||||
|
||||
if (*nrings == 0) {
|
||||
WARN("Could not create rings, falling back on simple ring");
|
||||
*nrings = 1;
|
||||
// Only this rank's own entries of the fallback ring are written here.
prev[rank] = (rank-1+nranks) % nranks;
|
||||
next[rank] = (rank+1)%nranks;
|
||||
}
|
||||
|
||||
int maxNrings = ncclParamMaxNrings();
|
||||
int minNrings = ncclParamMinNrings();
|
||||
if (maxNrings > 0 && minNrings > maxNrings) {
|
||||
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
|
||||
minNrings = 0;
|
||||
}
|
||||
if (minNrings > MAXCHANNELS) {
|
||||
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
|
||||
minNrings = MAXCHANNELS;
|
||||
}
|
||||
if (maxNrings > 0 && maxNrings <= *nrings) {
|
||||
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
|
||||
*nrings = maxNrings;
|
||||
} else {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
int defaultMinNrings = 1;
|
||||
#else
|
||||
// NOTE(review): a ncclCudaCompCap() that returns major*10+minor would never equal 3 here - verify which definition this file links against.
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
|
||||
#endif
|
||||
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
|
||||
if (minNrings > 0 && minNrings > *nrings) {
|
||||
if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
|
||||
*nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
|
||||
}
|
||||
}
|
||||
|
||||
// A user-forced thread count overrides whatever the search chose.
NCCLCHECK(getEnvThreads(nthreads));
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "topo.h"
|
||||
|
||||
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
||||
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
||||
|
||||
// Resolves the canonical filesystem path of device `cudaDev`'s PCI entry.
// On success *path holds a buffer returned by realpath() that the caller
// must free(); on failure *path is NULL and ncclSystemError is returned.
ncclResult_t getCudaPath(int cudaDev, char** path) {
  char busId[BUSID_SIZE];
  CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
  // Lower-case the whole buffer (presumably to match sysfs entry names).
  for (char* c = busId; c < busId+BUSID_SIZE; c++) *c = tolower(*c);

  // Template path; both bus-id placeholders are patched in place.
  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  char* const busSlot = busPath + (sizeof("/sys/class/pci_bus/")-1);
  char* const devSlot = busPath + (sizeof("/sys/class/pci_bus/0000:00/../../")-1);
  memcpy(busSlot, busId, BUSID_REDUCED_SIZE-1);  // "dddd:bb" prefix
  memcpy(devSlot, busId, BUSID_SIZE-1);          // full "dddd:bb:dd.f"

  char* resolved = realpath(busPath, NULL);
  *path = resolved;
  if (resolved != NULL) return ncclSuccess;

  WARN("Could not find real path of %s", busPath);
  return ncclSystemError;
}
|
||||
|
||||
const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
|
||||
|
||||
int pciDistance(char* path1, char* path2) {
|
||||
int score = 0;
|
||||
int depth = 0;
|
||||
int same = 1;
|
||||
for (int i=0; i<strlen(path1); i++) {
|
||||
if (path1[i] != path2[i]) same = 0;
|
||||
if (path1[i] == '/') {
|
||||
depth++;
|
||||
if (same == 1) score++;
|
||||
}
|
||||
}
|
||||
if (score <= 3) {
|
||||
#ifdef __PPC__
|
||||
// NUMA distance detection and PATH_SYS not supported on IBM/Power nodes
|
||||
// nodes currently
|
||||
return PATH_NODE;
|
||||
#else
|
||||
/* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
|
||||
int numaId1 = getNumaId(path1);
|
||||
int numaId2 = getNumaId(path2);
|
||||
TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
|
||||
return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
|
||||
#endif
|
||||
}
|
||||
if (score == 4) return PATH_PHB;
|
||||
if (score == depth-1) return PATH_PIX;
|
||||
return PATH_PXB;
|
||||
}
|
||||
+75
-99
@@ -6,32 +6,54 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "utils.h"
|
||||
#include "debug.h"
|
||||
#include "nccl_net.h"
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "nvmlwrap.h"
|
||||
#include "core.h"
|
||||
|
||||
#include "nvmlwrap.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
// Get current Compute Capability
|
||||
int ncclCudaCompCap() {
|
||||
int cudaDev;
|
||||
if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
|
||||
int ccMajor, ccMinor;
|
||||
if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
|
||||
if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0;
|
||||
return ccMajor*10+ccMinor;
|
||||
}
|
||||
|
||||
// Formats a packed 64-bit bus id as "dddd:bb:dd.f" into `busId`.
// Bits 3:0, 11:4 and 19:12 supply the last three fields and everything from
// bit 20 up supplies the first. The caller provides a large enough buffer.
ncclResult_t int64ToBusId(int64_t id, char* busId) {
  const int64_t domain   = id >> 20;
  const int64_t bus      = (id & 0xff000) >> 12;
  const int64_t device   = (id & 0xff0) >> 4;
  const int64_t function = id & 0xf;
  sprintf(busId, "%04lx:%02lx:%02lx.%01lx", domain, bus, device, function);
  return ncclSuccess;
}
|
||||
|
||||
// Parses a textual PCI bus id (e.g. "0000:00:00.0") into a 64-bit integer by
// dropping the ':' and '.' separators and reading the remaining hex digits
// as one number. Parsing stops at the first character that is neither a
// separator nor a hex digit.
ncclResult_t busIdToInt64(char* busId, int64_t* id) {
  const int size = strlen(busId);
  char* hexStr;
  // One extra byte for the terminator: when busId contains no separators,
  // every character is copied and the '\0' lands at index `size`.
  NCCLCHECK(ncclCalloc(&hexStr, size+1));
  int hexOffset = 0;
  for (int i=0; i<size; i++) {
    char c = busId[i];
    if (c == '.' || c == ':') continue;
    if ((c >= '0' && c <= '9') ||
        (c >= 'A' && c <= 'F') ||
        (c >= 'a' && c <= 'f')) {
      hexStr[hexOffset++] = busId[i];
    } else break;
  }
  hexStr[hexOffset] = '\0';
  *id = strtol(hexStr, NULL, 16);
  free(hexStr);
  return ncclSuccess;
}
|
||||
|
||||
// Convert a logical cudaDev index to the NVML device minor number
|
||||
ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// Assign nvmlDev to be the same as cudaDev to avoid garbage numbers
|
||||
*nvmlDev = cudaDev;
|
||||
#else
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
nvmlDevice_t nvmlDevice;
|
||||
unsigned int dev;
|
||||
*nvmlDev = -1;
|
||||
CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
|
||||
NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
|
||||
NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
|
||||
|
||||
*nvmlDev = dev;
|
||||
#endif
|
||||
|
||||
// Fetches the PCI bus id of `cudaDev` as text and converts it to its packed
// 64-bit form via busIdToInt64.
ncclResult_t getBusId(int cudaDev, int64_t *busId) {
  // The usual form is 0000:00:00.0; the buffer is sized for a wider
  // (8-digit) PCI domain in case one is reported.
  char pciText[] = "00000000:00:00.0";
  const int pciTextLen = sizeof(pciText);
  CUDACHECK(hipDeviceGetPCIBusId(pciText, pciTextLen, cudaDev));
  NCCLCHECK(busIdToInt64(pciText, busId));
  return ncclSuccess;
}
|
||||
|
||||
@@ -46,53 +68,6 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Common logging function used by the INFO, WARN and TRACE macros
|
||||
* Also exported to the dynamically loadable Net transport modules so
|
||||
* they can share the debugging mechanisms and output files
|
||||
*/
|
||||
/* Formats one log record (prefix + caller's printf-style message) and writes
 * it to ncclDebugFile under ncclDebugOutputLock.
 *   level    : NCCL_LOG_WARN / NCCL_LOG_INFO / NCCL_LOG_TRACE
 *   flags    : subsystem bits, matched against ncclDebugMask for INFO/TRACE
 *   filefunc : caller location, printed for WARN and TRACE
 *   line     : caller line number
 *   fmt, ... : printf-style message
 * When ncclDebugLevel == NCCL_LOG_ABORT, a WARN also aborts the process.
 */
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel <= NCCL_LOG_NONE) return;

  char hostname[1024];
  getHostName(hostname, 1024, '.');
  // Print -1 instead of an indeterminate value when no device can be queried.
  int cudaDev = -1;
  if (hipGetDevice(&cudaDev) != hipSuccess) cudaDev = -1;

  char buffer[1024];
  size_t len = 0;
  pthread_mutex_lock(&ncclDebugOutputLock);
  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
    len = snprintf(buffer, sizeof(buffer),
                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
    len = snprintf(buffer, sizeof(buffer),
                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
#ifdef ENABLE_TRACE
  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
    len = snprintf(buffer, sizeof(buffer),
                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
  }
#endif
  if (len) {
    // snprintf returns the untruncated length (or a negative error, which
    // wraps to a huge value in size_t). Clamp so that buffer+len and
    // sizeof(buffer)-len below always describe space inside `buffer`.
    if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
    va_list vargs;
    va_start(vargs, fmt);
    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
    va_end(vargs);
    fprintf(ncclDebugFile,"%s\n", buffer);
    fflush(ncclDebugFile);
  }
  pthread_mutex_unlock(&ncclDebugOutputLock);

  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
            hostname, getpid(), gettid(), cudaDev, filefunc, line);
    abort();
  }
}
|
||||
|
||||
uint64_t getHash(const char* string, int n) {
|
||||
// Based on DJB2, result = result * 33 + char
|
||||
uint64_t result = 5381;
|
||||
@@ -102,40 +77,43 @@ uint64_t getHash(const char* string, int n) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// DJB2-style hash (hash = hash*33 + char) over exactly the first `n` bytes
// of `string`, seeded with 9527. Embedded NUL bytes are hashed like any
// other byte; n <= 0 yields the seed.
uint64_t getnHash(const char* string, int n) {
  uint64_t hash = 9527;
  const char* cursor = string;
  for (int remaining = n; remaining > 0; remaining--) {
    hash = hash * 33 + *cursor++;
  }
  return hash;
}
|
||||
|
||||
/* Generate a hash of the unique identifying string for this host
|
||||
* that will be unique for both bare-metal and container instances
|
||||
* Equivalent of a hash of:
|
||||
*
|
||||
* $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt)
|
||||
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
|
||||
*
|
||||
* This string can be overridden by using the NCCL_HOSTID env var.
|
||||
*/
|
||||
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
|
||||
uint64_t getHostHash(void) {
|
||||
char uname[1024];
|
||||
// Start off with the full hostname
|
||||
(void) getHostName(uname, sizeof(uname), '\0');
|
||||
int offset = strlen(uname);
|
||||
int len;
|
||||
// $(readlink /proc/self/ns/uts)
|
||||
len = readlink("/proc/self/ns/uts", uname+offset, sizeof(uname)-1-offset);
|
||||
if (len < 0) len = 0;
|
||||
offset += len;
|
||||
// $(readlink /proc/self/ns/mnt)
|
||||
len = readlink("/proc/self/ns/mnt", uname+offset, sizeof(uname)-1-offset);
|
||||
if (len < 0) len = 0;
|
||||
offset += len;
|
||||
// Trailing '\0'
|
||||
uname[offset]='\0';
|
||||
TRACE(NCCL_INIT,"unique hostname '%s'", uname);
|
||||
char hostHash[1024];
|
||||
char *hostId;
|
||||
|
||||
return getHash(uname, strlen(uname));
|
||||
// Fall back is the full hostname if something fails
|
||||
(void) getHostName(hostHash, sizeof(hostHash), '\0');
|
||||
int offset = strlen(hostHash);
|
||||
|
||||
if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
|
||||
strncpy(hostHash, hostId, sizeof(hostHash));
|
||||
} else {
|
||||
FILE *file = fopen(HOSTID_FILE, "r");
|
||||
if (file != NULL) {
|
||||
char *p;
|
||||
if (fscanf(file, "%ms", &p) == 1) {
|
||||
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
|
||||
free(p);
|
||||
}
|
||||
}
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
// Make sure the string is terminated
|
||||
hostHash[sizeof(hostHash)-1]='\0';
|
||||
|
||||
TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
|
||||
|
||||
return getHash(hostHash, strlen(hostHash));
|
||||
}
|
||||
|
||||
/* Generate a hash of the unique identifying string for this process
|
||||
@@ -162,8 +140,6 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
|
||||
if (!string) return 0;
|
||||
|
||||
const char* ptr = string;
|
||||
// Ignore "^" or "=" prefix, will be detected outside of this function
|
||||
if (ptr[0] == '^' || ptr[0] == '=') ptr++;
|
||||
|
||||
int ifNum = 0;
|
||||
int ifC = 0;
|
||||
|
||||
@@ -44,7 +44,7 @@ typedef enum { ncclSuccess = 0,
|
||||
* This integer is coded with the MAJOR, MINOR and PATCH level of the
|
||||
* NCCL library
|
||||
*/
|
||||
ncclResult_t ncclGetVersion(int *version);
|
||||
ncclResult_t ncclGetVersion(int *version);
|
||||
ncclResult_t pncclGetVersion(int *version);
|
||||
|
||||
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
|
||||
@@ -248,7 +248,8 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
|
||||
* Start a group call. All subsequent calls to NCCL may not block due to
|
||||
* inter-CPU synchronization.
|
||||
*/
|
||||
ncclResult_t ncclGroupStart();
|
||||
ncclResult_t ncclGroupStart();
|
||||
ncclResult_t pncclGroupStart();
|
||||
|
||||
/*
|
||||
* Group End
|
||||
@@ -256,7 +257,8 @@ ncclResult_t ncclGroupStart();
|
||||
* End a group call. Wait for all calls since ncclGroupStart to complete
|
||||
* before returning.
|
||||
*/
|
||||
ncclResult_t ncclGroupEnd();
|
||||
ncclResult_t ncclGroupEnd();
|
||||
ncclResult_t pncclGroupEnd();
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
|
||||
@@ -5,7 +5,8 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
@@ -120,13 +121,13 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
|
||||
}
|
||||
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree up
|
||||
struct ncclTree* tree = &args->channel->tree;
|
||||
struct ncclTree* tree = &args->channel->treeUp;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
|
||||
NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
|
||||
}
|
||||
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree down
|
||||
struct ncclTree* tree = &args->channel->tree;
|
||||
struct ncclTree* tree = &args->channel->treeDn;
|
||||
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
|
||||
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
|
||||
}
|
||||
@@ -158,7 +159,9 @@ void* persistentThread(void *comm_) {
|
||||
}
|
||||
} while (op == NULL);
|
||||
op->idle = 0;
|
||||
if (op->state != ncclProxyOpNone) ret = op->progress(op);
|
||||
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
|
||||
// yet and might be cancelled before they even start. Hold off on those.
|
||||
if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
|
||||
if (ret != ncclSuccess) {
|
||||
comm->fatalError = ret;
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
|
||||
|
||||
+78
-246
@@ -5,39 +5,9 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "transport.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
#include "param.h"
|
||||
#include "topo.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <assert.h>
|
||||
|
||||
#define NET_MAX_IFS 16
|
||||
#define NET_MAX_GPUS 32
|
||||
|
||||
// Cache GPU-NIC distances to avoid re-computing them
|
||||
#define NET_TVALUE_UNKNOWN 0ULL
|
||||
static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
|
||||
static int ncclNetNDev;
|
||||
|
||||
// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
|
||||
#define NET_BITS_PER_IF 3
|
||||
#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
|
||||
static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
|
||||
// Packs one score per network device into a single ncclTvalue_t, using
// NET_BITS_PER_IF bits per device with device 0 in the lowest bits.
// The score is 1 + PATH_SYS - distance, so smaller distances score higher.
static ncclTvalue_t getTvalue(short* distances, int ndev) {
  ncclTvalue_t packed = 0;
  int shift = 0;
  for (int d=0; d<ndev; d++, shift += NET_BITS_PER_IF) {
    const ncclTvalue_t closeness = 1 + PATH_SYS - distances[d];
    // Only the low NET_BITS_PER_IF bits of the score are kept.
    packed |= (closeness & NET_BITS_PER_IF_MASK) << shift;
  }
  return packed;
}
|
||||
// Extracts device `dev`'s NET_BITS_PER_IF-bit score from a packed tvalue.
static int getScore(ncclTvalue_t tvalue, int dev) {
  const int shift = dev*NET_BITS_PER_IF;
  const ncclTvalue_t field = tvalue >> shift;
  return field & NET_BITS_PER_IF_MASK;
}
|
||||
#include "graph.h"
|
||||
|
||||
struct netConnectInfo {
|
||||
ncclNetHandle_t netHandle;
|
||||
@@ -54,6 +24,7 @@ struct netSendResources {
|
||||
int buffSize;
|
||||
void* mhandle;
|
||||
void* llMhandle;
|
||||
void* ll128Mhandle;
|
||||
struct ncclRecvMem* devRecvMem;
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
@@ -71,234 +42,62 @@ struct netRecvResources {
|
||||
int buffSize;
|
||||
void* mhandle;
|
||||
void* llMhandle;
|
||||
void* ll128Mhandle;
|
||||
struct ncclRecvMem* devRecvMem;
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
|
||||
char* cudaPath = NULL;
|
||||
char* nicPath = NULL;
|
||||
ncclResult_t err;
|
||||
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
|
||||
err = ncclNetPciPath(dev, &nicPath);
|
||||
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
|
||||
if (nicPath) free(nicPath);
|
||||
if (cudaPath) free(cudaPath);
|
||||
/* Determine if two peers can communicate with NET */
|
||||
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Queries the number of network devices (capped at NET_MAX_IFS) and computes
// the distance from the current GPU to each one.
// On success *distances is a malloc'ed array of *ndev entries owned by the
// caller. On failure nothing is left allocated.
static ncclResult_t netDevices(int* ndev, short** distances) {
  NCCLCHECK(ncclNetDevices(ndev));
  if (*ndev == 0) {
    WARN("Error : Network returned 0 device");
    return ncclSystemError;
  }
  if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;

  // Identify the current GPU before allocating, so that these early returns
  // cannot leak the distance array.
  int cudaDev, nvmlDev;
  CUDACHECK(hipGetDevice(&cudaDev));
  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev));

  *distances = (short*)malloc(*ndev*sizeof(short));
  if (*distances == NULL) return ncclSystemError;

  // Build the summary line with bounded writes; `used` never passes the
  // last byte of `line`.
  char line[1024];
  size_t used = 0;
  int n = snprintf(line, sizeof(line), "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
  if (n > 0) used = ((size_t)n < sizeof(line)) ? (size_t)n : sizeof(line)-1;
  for (int d=0; d<*ndev; d++) {
    ncclResult_t res = netDistance(cudaDev, d, *distances+d);
    if (res != ncclSuccess) {
      // Release the array instead of returning with it still allocated.
      free(*distances);
      *distances = NULL;
      return res;
    }
    n = snprintf(line+used, sizeof(line)-used, " %s", pathDists[(*distances)[d]]);
    if (n > 0) {
      used += n;
      if (used >= sizeof(line)) used = sizeof(line)-1;
    }
  }
  INFO(NCCL_INIT|NCCL_NET, "%s", line);
  return ncclSuccess;
}
|
||||
|
||||
/* Determine if we can communicate with the peer */
|
||||
/* Determine if we can communicate with the peer.
 * Writes the current GPU's packed per-NIC scores to ret[0]. The value is
 * computed once per GPU and cached in ncclNetTvalues. myInfo and peerInfo
 * are not used. Returns ncclInternalError when the current device index does
 * not fit in the cache. */
ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
  int cudaDev;
  CUDACHECK(hipGetDevice(&cudaDev));
  // Validate the index before touching the cache; the table has
  // NET_MAX_GPUS entries.
  if (cudaDev < 0 || cudaDev >= NET_MAX_GPUS) {
    WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
    return ncclInternalError;
  }

  ret[0] = ncclNetTvalues[cudaDev];
  if (ret[0] != NET_TVALUE_UNKNOWN) return ncclSuccess;  // cache hit

  int nDev;
  short* distances;
  NCCLCHECK(netDevices(&nDev, &distances));
  ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
  ncclNetNDev = nDev;
  free(distances);
  return ncclSuccess;
}
|
||||
|
||||
static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
|
||||
int bestRank = -1;
|
||||
int bestScore = 0;
|
||||
for (int rank=0; rank<nranks; rank++) {
|
||||
if (groups[rank] != group) continue;
|
||||
for (int i=0; i<nranks; i++) {
|
||||
ncclTvalue_t netValue = values[rank*nranks+i];
|
||||
if (netValue != 0) {
|
||||
ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
|
||||
if (score >= minScore && score > bestScore) {
|
||||
bestScore = score;
|
||||
bestRank = rank;
|
||||
}
|
||||
// All other values should be the same, stop here for this rank
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return bestRank;
|
||||
}
|
||||
static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
|
||||
// For the last rank, we don't need the absolute best score, just to be within minScore.
|
||||
for (int rank=nranks-1; rank>=0; rank--) {
|
||||
if (groups[rank] != group) continue;
|
||||
if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;
|
||||
if (startRank == rank) continue;
|
||||
for (int i=0; i<nranks; i++) {
|
||||
ncclTvalue_t netValue = values[rank*nranks+i];
|
||||
if (netValue != 0) {
|
||||
ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
|
||||
if (score >= minScore) {
|
||||
return rank;
|
||||
}
|
||||
// All other values should be the same, stop here for this rank
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Builds the inter-group (network) links of up to *nringsRet rings.
// For every ring, each group gets a receiving rank (start) and a sending
// rank (end) on a NIC not yet used by that group; then end[g] is linked to
// start[g+1] in prev/next. If some group cannot be served, *nringsRet is
// lowered to the number of complete rings and the function returns.
// `groups` is expected to be ordered so that its last entry is the highest
// group id. `nthreads` is not used here.
ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
  int nGroups = groups[nranks-1] + 1;
  int *cardUsed, *starts, *ends;
  NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
  NCCLCHECK(ncclCalloc(&starts, nGroups));
  NCCLCHECK(ncclCalloc(&ends, nGroups));

  for (int ring = 0; ring<*nringsRet; ring++) {
    for (int group = 0; group<nGroups; group++) {
      int nranksInGroup = 0;
      int nsubGroups = 0;
      for (int rank=0; rank<nranks; rank++)
        if (groups[rank] == group) {
          nranksInGroup++;
          nsubGroups = std::max(subgroups[rank], nsubGroups);
        }
      starts[group] = ends[group] = -1;
      // Receive on the rank closest to the NIC
      for (int card=0; card<NET_MAX_IFS; card++) {
        if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
        int start = groupBestStart(nranks, groups, group, values, card, minScore);
        // No eligible receiver on this card: try the next one. This also
        // keeps subgroups[start] below from being indexed with -1.
        if (start == -1) continue;
        // Send from any rank, but best on a different subgroup and close to the NIC also.
        int end = (nranksInGroup == 1) ? start
          : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
        if (end != -1) {
          cardUsed[group*NET_MAX_IFS+card] = 1;
          starts[group] = start;
          ends[group] = end;
          break;
        }
      }
      if (starts[group] == -1 || ends[group] == -1) {
        *nringsRet = ring;
        goto done;
      }
    }
    // Link groups together
    for (int group = 0; group<nGroups; group++) {
      int nextGroup = (group+1)%nGroups;
      next[ring*nranks+ends[group]] = starts[nextGroup];
      prev[ring*nranks+starts[nextGroup]] = ends[group];
    }
  }
done:
  free(cardUsed);
  free(starts);
  free(ends);
  return ncclSuccess;
}
|
||||
|
||||
int getDev(int cudaDev, int ringId) {
|
||||
ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
|
||||
|
||||
int dev = 0;
|
||||
int maxScore = 0;
|
||||
for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
|
||||
int skip = ringId+1;
|
||||
while (skip) {
|
||||
for (int d=0; d<ncclNetNDev; d++) {
|
||||
if (getScore(tvalues, d) == maxScore) {
|
||||
skip--;
|
||||
if (skip == 0) { dev = d; goto end; }
|
||||
}
|
||||
}
|
||||
}
|
||||
end:
|
||||
return dev;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
||||
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
|
||||
|
||||
static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
|
||||
static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
|
||||
*useGdr = 0;
|
||||
|
||||
int cudaDev, nvmlDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
|
||||
|
||||
if (!hasFineGrainVramPcie()) {
|
||||
INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
if (read) { // For reads (sends) only enable under certain conditions
|
||||
int gdrReadParam = ncclParamNetGdrRead();
|
||||
if (gdrReadParam == 0) return ncclSuccess;
|
||||
if (gdrReadParam < 0) {
|
||||
int nvlink;
|
||||
NCCLCHECK(ncclNvlinkGpu(&nvlink));
|
||||
NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
|
||||
if (!nvlink) return ncclSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we are close enough that it makes sense to enable GDR
|
||||
int netGdrLevel = ncclParamNetGdrLevel();
|
||||
short distance;
|
||||
NCCLCHECK(netDistance(cudaDev, dev, &distance));
|
||||
int distance;
|
||||
NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
|
||||
if (distance >= netGdrLevel) {
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Finally, check if the NIC supports it
|
||||
int flags;
|
||||
NCCLCHECK(ncclNetPtrSupport(dev, &flags));
|
||||
NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
|
||||
if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
|
||||
*useGdr = 1;
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Determine if we will use this transport for this peer and return connect
|
||||
* information for this peer */
|
||||
ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
struct netSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
resources->netDev = getDev(cudaDev, channelId);
|
||||
NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
|
||||
NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
|
||||
@@ -310,20 +109,18 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
|
||||
resources->buffSize = buffSize;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
struct netRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
resources->netDev = getDev(cudaDev, channelId);
|
||||
NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
|
||||
NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
|
||||
@@ -331,12 +128,11 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
|
||||
if (resources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, cudaDev));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
|
||||
resources->buffSize = buffSize;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
|
||||
@@ -351,6 +147,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
send->conn.buff = recvMem->buff;
|
||||
send->conn.llBuff = resources->devHostRecvMem->llBuff;
|
||||
send->conn.ll128Buff = recvMem->ll128Buff;
|
||||
|
||||
// Head/Tail/Opcount/Fifos are always on host
|
||||
send->conn.tail = &resources->devHostRecvMem->tail;
|
||||
@@ -368,6 +165,8 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
|
||||
NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -381,6 +180,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
recv->conn.buff = recvMem->buff;
|
||||
recv->conn.llBuff = recvMem->llBuff;
|
||||
recv->conn.ll128Buff = recvMem->ll128Buff;
|
||||
|
||||
// Head/Tail/Opcount are always on host
|
||||
recv->conn.tail = &resources->devHostRecvMem->tail;
|
||||
@@ -396,6 +196,8 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -405,6 +207,7 @@ ncclResult_t netSendFree(void* transportResources) {
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
if (resources->useGdr)
|
||||
CUDACHECK(hipFree(resources->devRecvMem));
|
||||
@@ -418,6 +221,7 @@ ncclResult_t netRecvFree(void* transportResources) {
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
if (resources->useGdr)
|
||||
CUDACHECK(hipFree(resources->devRecvMem));
|
||||
@@ -445,7 +249,39 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
|
||||
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
|
||||
if (args->llMode) {
|
||||
if (args->protocol == NCCL_PROTO_LL128) {
|
||||
int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
|
||||
if (args->tail < *recvTail) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
if (sizesFifo[buffSlot] != -1) {
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
char* localBuff = (char*)localMem->ll128Buff;
|
||||
int ready = resources->useGdr;
|
||||
if (!ready) {
|
||||
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
|
||||
// called threadfence()
|
||||
uint64_t flag = args->tail + 1;
|
||||
int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
|
||||
volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
|
||||
ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
|
||||
}
|
||||
}
|
||||
if (ready) {
|
||||
// Send through network
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
sizesFifo[buffSlot] = -1;
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (args->protocol == NCCL_PROTO_LL) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
int size = LOAD(sizesFifo+buffSlot);
|
||||
if (size != -1) {
|
||||
@@ -471,17 +307,19 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
}
|
||||
} else if (args->tail < LOAD(recvTail)) {
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
int stepSize = args->channel->buffSize/NCCL_STEPS;
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
// Send through network
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
if (sizesFifo[buffSlot] != -1) {
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -520,11 +358,11 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
args->idle = 1;
|
||||
int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
|
||||
int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
|
||||
if (args->head < args->end) {
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
|
||||
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
|
||||
char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
|
||||
void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
|
||||
volatile uint64_t* sendHead = &resources->hostSendMem->head;
|
||||
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
@@ -541,13 +379,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
|
||||
if (done) {
|
||||
args->head += args->sliceSteps;
|
||||
if (args->llMode == 0) {
|
||||
if (resources->useGdr) {
|
||||
ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
|
||||
// Flush local HDP register after local read-back finishes
|
||||
//STORE(resources->curr_hdp_reg, 0x1);
|
||||
//TRACE(NCCL_NET, "Flushing GPU memory via HDP %p", resources->curr_hdp_reg);
|
||||
}
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
|
||||
STORE(&resources->hostRecvMem->tail, args->head);
|
||||
}
|
||||
args->idle = 0;
|
||||
@@ -566,7 +399,6 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
struct ncclTransport netTransport = {
|
||||
"NET",
|
||||
netCanConnect,
|
||||
netGetRings,
|
||||
{ netSendSetup, netSendConnect, netSendFree, netSendProxy },
|
||||
{ netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
|
||||
};
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "core.h"
|
||||
#include "socket.h"
|
||||
#include "net.h"
|
||||
#include "topo.h"
|
||||
#include "graph.h"
|
||||
#include "utils.h"
|
||||
#include "param.h"
|
||||
|
||||
@@ -108,7 +108,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
char* userIbEnv = getenv("NCCL_IB_HCA");
|
||||
struct netIf userIfs[MAX_IB_DEVS];
|
||||
bool searchNot = userIbEnv && userIbEnv[0] == '^';
|
||||
if (searchNot) userIbEnv++;
|
||||
bool searchExact = userIbEnv && userIbEnv[0] == '=';
|
||||
if (searchExact) userIbEnv++;
|
||||
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
|
||||
|
||||
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
|
||||
@@ -204,32 +206,14 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
|
||||
#endif
|
||||
}
|
||||
if (moduleLoaded == 0) return ncclSystemError;
|
||||
ncclResult_t ret = ncclSystemError;
|
||||
void* ptr;
|
||||
if (hipMalloc(&ptr, sizeof(int)) == hipSuccess) {
|
||||
struct ibv_mr* mr;
|
||||
struct ibv_pd* pd;
|
||||
if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
|
||||
if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) {
|
||||
ret = ncclSuccess;
|
||||
wrap_ibv_dereg_mr(mr);
|
||||
}
|
||||
wrap_ibv_dealloc_pd(pd);
|
||||
}
|
||||
hipFree(ptr);
|
||||
}
|
||||
return ret;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
|
||||
*supportedTypes = NCCL_PTR_HOST;
|
||||
|
||||
int cudaDev, nvmlDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
|
||||
|
||||
if (ncclIbGdrSupport(dev) != ncclSuccess) {
|
||||
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
|
||||
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
|
||||
return ncclSuccess;
|
||||
}
|
||||
*supportedTypes |= NCCL_PTR_CUDA;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
#include "core.h"
|
||||
#include "socket.h"
|
||||
#include "net.h"
|
||||
@@ -109,6 +109,7 @@ struct ncclSocketRequest {
|
||||
void* data;
|
||||
int size;
|
||||
int ctrlFd;
|
||||
int offset;
|
||||
int used;
|
||||
struct ncclSocketComm* comm;
|
||||
struct ncclSocketTask* tasks[MAX_SOCKETS];
|
||||
@@ -194,7 +195,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
||||
}
|
||||
if (nThreads == -2 || nSocksPerThread == -2) {
|
||||
// Auto-detection
|
||||
int autoNt=1, autoNs=1;
|
||||
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
|
||||
char vendorPath[PATH_MAX];
|
||||
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
|
||||
char* rPath = realpath(vendorPath, NULL);
|
||||
@@ -214,6 +215,9 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
||||
if (strcmp(vendor, "0x1d0f") == 0) { // AWS
|
||||
autoNt = 2;
|
||||
autoNs = 8;
|
||||
} else if (strcmp(vendor, "0x1ae0") == 0) { // GCP
|
||||
autoNt = 4;
|
||||
autoNs = 1;
|
||||
}
|
||||
end:
|
||||
if (nThreads == -2) nThreads = autoNt;
|
||||
@@ -227,7 +231,7 @@ end:
|
||||
}
|
||||
*ns = nSocks;
|
||||
*nt = nThreads;
|
||||
INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
|
||||
if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -380,31 +384,45 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
||||
return ncclInternalError;
|
||||
}
|
||||
r->size = data;
|
||||
r->offset = 0;
|
||||
r->used = 2; // done exchanging size
|
||||
// divide into subtasks
|
||||
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
|
||||
int chunkOffset = 0, i = 0;
|
||||
while (chunkOffset < r->size) {
|
||||
int chunkSize = std::min(taskSize, r->size-chunkOffset);
|
||||
NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
|
||||
chunkOffset += chunkSize;
|
||||
if (r->comm->nSocks > 0) {
|
||||
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
|
||||
while (chunkOffset < r->size) {
|
||||
int chunkSize = std::min(taskSize, r->size-chunkOffset);
|
||||
NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
|
||||
chunkOffset += chunkSize;
|
||||
}
|
||||
}
|
||||
r->nSubs = i;
|
||||
}
|
||||
if (r->used == 2) { // already exchanged size
|
||||
int nCompleted = 0;
|
||||
for (int i=0; i<r->nSubs; i++) {
|
||||
struct ncclSocketTask* sub = r->tasks[i];
|
||||
if (sub->result != ncclSuccess) return sub->result;
|
||||
if (sub->offset == sub->size) nCompleted++;
|
||||
}
|
||||
if (nCompleted == r->nSubs) {
|
||||
if (size) *size = r->size;
|
||||
*done = 1;
|
||||
r->used = 0;
|
||||
if (r->nSubs > 0) {
|
||||
int nCompleted = 0;
|
||||
for (int i=0; i<r->nSubs; i++) {
|
||||
struct ncclSocketTask* sub = r->tasks[i];
|
||||
sub->used = 0;
|
||||
if (sub->result != ncclSuccess) return sub->result;
|
||||
if (sub->offset == sub->size) nCompleted++;
|
||||
}
|
||||
if (nCompleted == r->nSubs) {
|
||||
if (size) *size = r->size;
|
||||
*done = 1;
|
||||
r->used = 0;
|
||||
for (int i=0; i<r->nSubs; i++) {
|
||||
struct ncclSocketTask* sub = r->tasks[i];
|
||||
sub->used = 0;
|
||||
}
|
||||
}
|
||||
} else { // progress request using main thread
|
||||
if (r->offset < r->size) {
|
||||
NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->data, r->size, &r->offset));
|
||||
}
|
||||
if (r->offset == r->size) {
|
||||
if (size) *size = r->size;
|
||||
*done = 1;
|
||||
r->used = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+68
-412
@@ -5,20 +5,12 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "comm.h"
|
||||
#include "graph.h"
|
||||
#include "utils.h"
|
||||
#include "topo.h"
|
||||
#include "transport.h"
|
||||
#include "param.h"
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <ctype.h>
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#include "nvlink_stub.h"
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#else
|
||||
#include "nvlink.h"
|
||||
#endif
|
||||
#include "shm.h"
|
||||
|
||||
@@ -59,29 +51,34 @@ NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
|
||||
NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
|
||||
|
||||
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
|
||||
static int busIdToCudaDev(const char* busId) {
|
||||
static int busIdToCudaDev(int64_t busId) {
|
||||
int ndev;
|
||||
if (hipGetDeviceCount(&ndev) != hipSuccess)
|
||||
return -1;
|
||||
for (int i = 0; i < ndev; i++) {
|
||||
char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
if (hipDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != hipSuccess)
|
||||
char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
if (hipDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != hipSuccess)
|
||||
return -1;
|
||||
if (strcmp(busId, devBusId) == 0) {
|
||||
return i;
|
||||
}
|
||||
int64_t devBusId;
|
||||
NCCLCHECK(busIdToInt64(devBusIdStr, &devBusId));
|
||||
if (busId == devBusId) return i;
|
||||
}
|
||||
// BusId was not found in our locally visible CUDA devices
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Determine if we can communicate with the peer through p2p */
|
||||
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
|
||||
// Do not use P2P across root complexes by default (provided CUDA permits it)
|
||||
int p2pLevel = PATH_NODE;
|
||||
/* Determine if two peers can communicate through p2p */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
int cpuCount;
|
||||
NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
|
||||
// Do not use P2P across sockets by default (provided CUDA permits it).
|
||||
// When we are on a single socket, don't even use P2P through the CPU as
|
||||
// it should be able to sustain two flows to sysmem faster than PCI P2P.
|
||||
int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
|
||||
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
|
||||
if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
|
||||
|
||||
// Disable P2P
|
||||
*ret = 0;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
@@ -91,415 +88,73 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
|
||||
if (p2pLevel == 0) return ncclSuccess;
|
||||
|
||||
// Rule out different nodes
|
||||
if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
|
||||
if (peerCudaDev == -1) {
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
int cudaDev2 = busIdToCudaDev(info2->busId);
|
||||
if (cudaDev1 == -1 || cudaDev2 == -1) {
|
||||
// Peer's CUDA device is not visible in this process
|
||||
#if CUDART_VERSION >= 10010
|
||||
// But in CUDA 10.1 we can still communicate with 'invisible' devices
|
||||
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
|
||||
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
|
||||
// Check for NVLink/NVswitch including P2P access
|
||||
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
|
||||
if (nvlinkp2p > 0) {
|
||||
*ret = nvlinkp2p;
|
||||
int nvlink;
|
||||
NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
|
||||
if (nvlink > 0) {
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
|
||||
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);
|
||||
|
||||
// Do not detect topology if we're on the same GPU. Note this is not really supported.
|
||||
if (myInfo->cudaDev == peerCudaDev) {
|
||||
*ret = 1 + PATH_SYS;
|
||||
if (cudaDev1 == cudaDev2) {
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// See if CUDA can do P2P
|
||||
int p2p;
|
||||
if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != hipSuccess) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
|
||||
myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
|
||||
if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
|
||||
cudaDev1, info1->busId, cudaDev2, info2->busId);
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (p2p == 0) return ncclSuccess;
|
||||
|
||||
int nvlink = 0;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
uint32_t link_type, hops;
|
||||
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) {
|
||||
if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) != hipSuccess) {
|
||||
p2p = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
|
||||
static unsigned long long link_status_print_once_mask = 0;
|
||||
if (!(link_status_print_once_mask & (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev)))) {
|
||||
INFO(NCCL_INIT, "%d -> %d: link type %s hops %d", myInfo->cudaDev, peerInfo->cudaDev,
|
||||
if (!(link_status_print_once_mask & (1 << (cudaDev1*8 + cudaDev2)))) {
|
||||
INFO(NCCL_INIT, "%d -> %d: link type %s hops %d", cudaDev1, cudaDev2,
|
||||
link_type_name[link_type], hops);
|
||||
link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
|
||||
link_status_print_once_mask |= (1 << (cudaDev1*8 + cudaDev2));
|
||||
}
|
||||
int nvlinkp2p = 0;
|
||||
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
|
||||
if (hops == 1)
|
||||
nvlinkp2p = CONNECT_NVLINK;
|
||||
}
|
||||
#else
|
||||
// Check for NVLink/NVswitch
|
||||
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
|
||||
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI)
|
||||
nvlink = (hops == 1);
|
||||
#else // Check for NVLink/NVswitch
|
||||
NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
|
||||
#endif
|
||||
if (nvlinkp2p > 0) {
|
||||
*ret = nvlinkp2p;
|
||||
if (nvlink > 0) {
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Finally compute the PCI distance and compare with the p2pLevel.
|
||||
char* myPath;
|
||||
char* peerPath;
|
||||
ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
|
||||
ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
|
||||
if (err1 == ncclSuccess && err2 == ncclSuccess) {
|
||||
int distance = pciDistance(myPath, peerPath);
|
||||
if (distance < p2pLevel) {
|
||||
*ret = 1 + PATH_SYS - distance;
|
||||
}
|
||||
int distance;
|
||||
NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
|
||||
if (distance < p2pLevel) {
|
||||
*ret = 1;
|
||||
}
|
||||
if (err1 == ncclSuccess) free(myPath);
|
||||
if (err2 == ncclSuccess) free(peerPath);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
|
||||
#define MAXGPUS_PCI 64
|
||||
|
||||
/* Recursive backtracking search that fills `rings` one slot at a time.
 *
 * `matrix` is an n x n table of per-pair counters; a counter is decremented
 * while the corresponding hop is in use and incremented again on backtrack.
 * `rings` holds n slots per ring; `currentRing` is the ring being built,
 * `current` the rank being placed, and `remaining` the number of slots still
 * to fill in this ring after `current`. `inTheRing[i]` is non-zero while rank
 * i is already placed in the ring under construction. When `connect` is set,
 * the first two slots of every ring are taken as given.
 *
 * Returns how many rings (counting the current one) could be completed from
 * this state; the slots from the current position onward are left holding the
 * best continuation found.
 */
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
  ncclTvalue_t* links = matrix + current*n;        // counters for hops leaving `current`
  const int step = (currentRing+1)*n - remaining;  // first slot after `current`
  int best = 0;

  // Place `current` in its slot and mark it as used.
  inTheRing[current] = 1;
  rings[step-1] = current;

  if (remaining != 0) {
    // Ring not complete yet: try every unused rank still reachable from `current`.
    int saved[MAXCHANNELS*MAXGPUS_NVLINKP2P];
    int savedEnd = 0;
    for (int cand = 0; cand < n; cand++) {
      if (inTheRing[cand] != 0 || !(links[cand] > 0)) continue;
      links[cand]--;
      int found = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, cand, remaining-1, connect);
      if (found > best) {
        best = found;
        savedEnd = (found+currentRing)*n;
        // Remember this candidate and everything the recursion wrote after it.
        saved[step] = cand;
        for (int s = step+1; s < savedEnd; s++) {
          saved[s] = rings[s];
        }
        // All requested rings were found: stop searching right away
        // (note the counter for `cand` is intentionally not restored here).
        if (best + currentRing == nRingsMax) break;
      }
      links[cand]++;
    }
    // Put the best continuation back into `rings`.
    for (int s = step; s < savedEnd; s++) {
      rings[s] = saved[s];
    }
  } else {
    // Ring is full: it is valid only if we can hop back to its first rank.
    const int first = rings[currentRing*n];
    if (links[first] > 0) {
      if (currentRing+1 == nRingsMax) {
        best = 1;
      } else {
        links[first]--;
        for (int k = 0; k < n; k++) inTheRing[k] = 0;
        if (connect) {
          // The next ring's first two slots are preset; honor them.
          inTheRing[rings[step]] = 1;
          best = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[step+1], n-2, connect);
        } else {
          rings[(currentRing+1)*n] = 0;
          best = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect);
        }
        links[first]++;
        for (int k = 0; k < n; k++) inTheRing[k] = 1;
      }
    }
  }

  inTheRing[current] = 0;
  return best;
}
|
||||
|
||||
static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
|
||||
if (nrings == 0) return 0;
|
||||
// Copy rings by dup times
|
||||
if (newNrings > MAXCHANNELS) {
|
||||
newNrings = MAXCHANNELS;
|
||||
}
|
||||
for (int r=nrings; r<newNrings; r++) {
|
||||
for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
|
||||
}
|
||||
return newNrings;
|
||||
}
|
||||
|
||||
// Entry point of the recursive ring search over a link-count matrix.
// Allocates the per-rank "already in the ring" flags, seeds the search and
// returns whatever computeRingsRec reports (0 if the flag array cannot be
// allocated).
//   connect != 0 : rings[0] and rings[1] are taken as already fixed; the
//                  search starts from rings[1] with nranks-2 ranks left.
//   connect == 0 : the ring is started at rank 0 with nranks-1 ranks left.
int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) {
  int* visited = (int*)malloc(sizeof(int)*nranks);
  if (visited == NULL) {
    WARN("malloc of %ld bytes failed", sizeof(int)*nranks);
    return 0;
  }
  for (int k = 0; k < nranks; k++) visited[k] = 0;

  int firstRank;
  int ranksLeft;
  if (connect) {
    // The first slot is pre-set by the caller: mark it used and continue
    // from the second slot.
    visited[rings[0]] = 1;
    firstRank = rings[1];
    ranksLeft = nranks-2;
  } else {
    rings[0] = 0;
    firstRank = 0;
    ranksLeft = nranks-1;
  }

  const int found = computeRingsRec(matrix, nranks, rings, 0, nringsMax, visited, firstRank, ranksLeft, connect);
  free(visited);
  return found;
}
|
||||
|
||||
// Return the index of the first entry of `ranks` (length `nranks`) that is
// not -1, or -1 when every entry is -1 (or the array is empty).
static inline int findConnect(int nranks, int* ranks) {
  int idx = 0;
  while (idx < nranks) {
    if (ranks[idx] != -1) return idx;
    idx++;
  }
  return -1;
}
|
||||
|
||||
// Compute rings over direct NVLink connections.
//  1. Clamp the requested ring count to MAXCHANNELS.
//  2. For every ring that already has both a prev and a next entry set,
//     pre-fill its first two slots and switch to "connect" mode.
//  3. Build a link-count matrix (values / CONNECT_NVLINK, doubled when
//     oversubscribing) and run the recursive search on it.
//  4. When neither oversubscribing nor connecting: for up to 4 ranks, retry
//     with oversubscription if fewer rings than requested were found, then
//     duplicate the resulting rings.
// Returns the number of rings written to `rings` (0 on allocation failure).
int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
  if (nrings == 0) return 0;
  if (nrings > MAXCHANNELS) {
    WARN("Max rings reached, limiting to %d", MAXCHANNELS);
    nrings = MAXCHANNELS;
  }

  // Look for rings whose endpoints are already constrained.
  int connect = 0;
  for (int ring = 0; ring < nrings; ring++) {
    const int base = ring*nranks;
    const int start = findConnect(nranks, prev+base);
    const int end = findConnect(nranks, next+base);
    if (start == -1 || end == -1) continue;
    rings[base] = end;
    rings[base+1] = start;
    connect = 1;
  }

  // Translate connection values into per-pair link counts.
  ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks);
  if (matrix == NULL) {
    WARN("malloc of %ld bytes failed", sizeof(ncclTvalue_t)*nranks*nranks);
    return 0;
  }
  for (int row = 0; row < nranks; row++) {
    for (int col = 0; col < nranks; col++) {
      const int cell = row*nranks+col;
      if (oversubscribe) {
        matrix[cell] = values[cell]/CONNECT_NVLINK*2;
      } else {
        matrix[cell] = values[cell]/CONNECT_NVLINK;
      }
    }
  }

  int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect);
  free(matrix);

  // In oversubscribe or connect mode the search result is final.
  if (oversubscribe || connect) return compNrings;

  const int worthRetrying = compNrings && compNrings < nrings && nranks <= 4;
  if (worthRetrying) {
    // Try to oversubscribe to get a better result
    int* altRings = (int*)malloc(sizeof(int)*MAXCHANNELS*nranks);
    if (altRings == NULL) {
      WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks);
      return 0;
    }
    for (int k = 0; k < MAXCHANNELS*nranks; k++) altRings[k] = -1;

    // The nested call gets its own copy of the thread count.
    int nThreads = *nthreads;
    const int altNrings = p2pComputeRingsNvLink(values, nranks, altRings, nrings, prev, next, 1, &nThreads);
    if (altNrings > compNrings*2) {
      // Oversubscription worked: adopt the alternative rings.
      for (int k = 0; k < altNrings*nranks; k++) rings[k] = altRings[k];
      compNrings = altNrings;
    }
    free(altRings);
  }

  // Duplicate the rings for direct NVLink (x3 on HIP builds, x2 otherwise).
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  const int dupFactor = 3;
#else
  const int dupFactor = 2;
#endif
  compNrings = copyRings(nranks, rings, compNrings, compNrings*dupFactor);

  return compNrings;
}
|
||||
|
||||
// Complete rings whose endpoints are already fixed by visiting the remaining
// ranks in increasing order (modulo nranks) after `start`.
// For each ring r: slot 0 = end, slot 1 = start, then every other rank in
// sequence, skipping `start` and `end`.
// Returns:
//   nringsStart  if every ring could be completed,
//   r            if ring r has no start/end but earlier rings were connected,
//   0            if a hop scores below minScore, or if the first ring (or any
//                ring before a connection was seen) has no start/end.
int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
  int connect = 0;
  for (int ring = 0; ring < nringsStart; ring++) {
    const int base = ring*nranks;
    const int start = findConnect(nranks, prev+base);
    const int end = findConnect(nranks, next+base);

    if (start == -1 || end == -1) {
      if (connect == 1 && ring > 0) {
        WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", ring);
        return ring;
      }
      return 0;
    }

    rings[base] = end;
    rings[base+1] = start;

    int cur = start;
    for (int slot = 2; slot < nranks; slot++) {
      // Next rank in sequence that is neither endpoint.
      int candidate = (cur+1) % nranks;
      while (candidate == end || candidate == start) candidate = (candidate+1) % nranks;
      if (values[cur*nranks+candidate] < minScore) {
        return 0;
      }
      rings[base+slot] = candidate;
      cur = candidate;
    }
    connect = 1;
  }
  return nringsStart;
}
|
||||
|
||||
// Fill `nringsStart` rings with the identity order 0, 1, ..., nranks-1.
// The values/prev/next/minScore/nthreads arguments are not used here.
// Returns nringsStart.
int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
  int* slot = rings;
  for (int ring = 0; ring < nringsStart; ring++) {
    // Rings are stored back to back, so a running pointer covers them all.
    for (int rank = 0; rank < nranks; rank++) {
      *slot++ = rank;
    }
  }
  return nringsStart;
}
|
||||
|
||||
// Pick the next rank for a PCI ring.
// Scores are scanned from PATH_SYS+1 down to minScore; at each score only
// ranks not flagged in `inRing` whose values[rank][n] equals that score are
// considered.
//   end == -1 : the first such rank is returned immediately.
//   otherwise : among the candidates at that score, the one with the
//               smallest values[end][n] is returned (the original comment
//               describes this as "closest to rank, farthest from end").
// Returns -1 when no candidate exists at any score >= minScore.
static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
  int score = PATH_SYS+1;
  while (score >= minScore) {
    int chosen = -1;
    int lowestEndValue = PATH_SYS+2;
    for (int peer = 0; peer < nranks; peer++) {
      if (inRing[peer] || !(values[rank*nranks+peer] == score)) continue;
      if (end == -1) return peer;
      if (values[end*nranks+peer] < lowestEndValue) {
        chosen = peer;
        lowestEndValue = values[end*nranks+peer];
      }
    }
    if (chosen != -1) return chosen;
    score--;
  }
  return -1;
}
|
||||
|
||||
// Build rings over PCIe/QPI by greedily chaining the closest unused rank.
// For each ring: determine (or pick) the end/start pair, append ranks chosen
// by findClosestPci, then verify the last rank can reach `end` again.
// Returns the number of usable rings:
//   r       when ring r cannot be built (missing/inconsistent endpoints, no
//           candidate rank, or the loop does not close),
//   1       after the first ring when no pre-existing connection was found,
//   nrings  when every ring was completed in connect mode.
// NOTE(review): the per-ring flag array has MAXGPUS_PCI entries and nranks is
// not checked against it here - confirm the caller guarantees the bound.
int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) {
  int connect = 0;
  for (int ring = 0; ring < nrings; ring++) {
    const int base = ring*nranks;
    int start = findConnect(nranks, prev+base);
    int end = findConnect(nranks, next+base);

    int used[MAXGPUS_PCI];
    for (int k = 0; k < nranks; k++) used[k] = 0;

    const int haveStart = (start != -1);
    const int haveEnd = (end != -1);
    if (haveStart && haveEnd) {
      // Both endpoints are imposed from outside.
      connect = 1;
    } else if (haveStart || haveEnd) {
      WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", ring);
      return ring;
    } else {
      if (connect == 1 && ring > 0) {
        WARN("Connecting ring %d : did not find start/end. Disabling other rings.", ring);
        return ring;
      }
      // No constraint: close the ring on rank 0 and start from its closest peer.
      end = 0;
      used[end] = 1;
      start = findClosestPci(values, used, end, -1, nranks, minScore);
      if (start == -1) return ring;
    }

    rings[base] = end;
    rings[base+1] = start;
    used[start] = used[end] = 1;

    int cur = start;
    for (int slot = 2; slot < nranks; slot++) {
      const int candidate = findClosestPci(values, used, cur, end, nranks, minScore);
      if (candidate == -1) return ring;

      used[candidate] = 1;
      rings[base+slot] = candidate;
      cur = candidate;
    }

    // Check the loop is closing: with `end` made available again it must be
    // the rank selected from the last position.
    used[end] = 0;
    if (findClosestPci(values, used, cur, end, nranks, minScore) != end) return ring;

    if (connect == 0) return 1;
  }
  return nrings;
}
|
||||
|
||||
// Compute intra-node P2P rings and publish them through prev/next.
//
// Strategy, in order:
//   1. NVSwitch: if values[..]/CONNECT_NVSWITCH is non-zero, connect the
//      existing rings sequentially, or create sequential rings (limited to
//      the NVSwitch link count) and duplicate them.
//   2. Direct NVLink: if every rank has at least one NVLink
//      (sum of val/CONNECT_NVLINK for values below CONNECT_NVSWITCH), run the
//      NVLink ring search. Fails for more than MAXGPUS_NVLINKP2P ranks.
//   3. Otherwise: PCIe/QPI ring construction.
//
// On success *nringsRet receives the ring count and, for each ring, every
// prev/next entry still equal to -1 is filled from the computed ring order.
// Returns ncclSuccess, ncclInternalError on inconsistent NVSwitch link counts
// or too many ranks for the NVLink search, or the error from ncclCalloc.
// The temporary ring buffer is released on every exit path after allocation.
ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
  if (*nringsRet == 0) return ncclSuccess;
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
  for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
  int nrings = *nringsRet;

  // NVswitch
  int nvswitchLinks = 0;
  int directLinks = 0;
  for (int rank=0; rank<nranks; rank++) {
    for (int j=1; j<nranks; j++) {
      int i = (rank + j) % nranks;
      ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH;
      if (j>1 && links != nvswitchLinks) {
        WARN("Internal error : NVswitch links mismatch");
        free(rings);  // do not leak the scratch buffer on the error path
        return ncclInternalError;
      }
      nvswitchLinks = links;
    }
  }
  if (nvswitchLinks) {
    // NVSwitch : Connect existing rings
    int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads);
    if (nringsConnected > 0) {
      nrings = nringsConnected;
    } else {
      nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks
      // Or create new ones
      nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads);
      // And duplicate them
      nrings = copyRings(nranks, rings, nrings, nrings*2);
    }
    goto end;
  }

  // point-to-point NVLink
  for (int rank=0; rank<nranks; rank++) {
    int links = 0;
    for (int i=0; i<nranks; i++) {
      ncclTvalue_t val = values[rank*nranks+i];
      if (val >= CONNECT_NVSWITCH) continue;
      links += val/CONNECT_NVLINK;
    }
    // directLinks ends up as the minimum link count over all ranks.
    if (rank == 0) directLinks = links;
    else directLinks = std::min(directLinks, links);
  }
  if (directLinks > 0) {
    // NVLink : Connect rings or create new ones
    if (nranks > MAXGPUS_NVLINKP2P) {
      WARN("Recursive P2P computation cannot work for >8 GPUs");
      free(rings);  // do not leak the scratch buffer on the error path
      return ncclInternalError;
    }
    nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
    goto end;
  }

  // PCIe or QPI : Connect rings or create new ones
  nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore);

end:
  *nringsRet = nrings;
  // Translate ring order into prev/next links, keeping any entry that was
  // already set (i.e. not -1) by an earlier stage.
  for (int ring = 0; ring<nrings; ring++) {
    for (int index=0; index<nranks; index++) {
      int prevIndex = (index - 1 + nranks) % nranks;
      int nextIndex = (index + 1) % nranks;
      int curRank = rings[ring*nranks+index];
      int prevRank = rings[ring*nranks+prevIndex];
      int nextRank = rings[ring*nranks+nextIndex];
      if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank;
      if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank;
    }
  }

  free(rings);
  return ncclSuccess;
}
|
||||
|
||||
@@ -513,7 +168,7 @@ end:
|
||||
#define MAX_SHM_NAME_LEN 1024
|
||||
|
||||
/* Send: Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
struct p2pSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
@@ -522,6 +177,7 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));
|
||||
|
||||
resources->next_hdp_reg = 0;
|
||||
uint32_t linktype, hops;
|
||||
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
|
||||
@@ -531,8 +187,6 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
|
||||
}
|
||||
else
|
||||
resources->next_hdp_reg = 0;
|
||||
|
||||
struct p2pConnectInfo info;
|
||||
info.id = channelId;
|
||||
@@ -549,19 +203,20 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
info.direct = 1;
|
||||
info.directPtr = resources->devMem;
|
||||
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||
return ncclInternalError;
|
||||
} else {
|
||||
// Enable P2P access
|
||||
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
||||
if (err == hipErrorPeerAccessAlreadyEnabled) {
|
||||
hipGetLastError();
|
||||
} else if (err != hipSuccess) {
|
||||
WARN("failed to peer with device %d(=%d): %d %s",
|
||||
peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
|
||||
WARN("failed to peer with device %d(=%lx): %d %s",
|
||||
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
|
||||
channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
}
|
||||
} else {
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
@@ -570,12 +225,12 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
// Map IPC and enable P2P access
|
||||
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
|
||||
if (err != hipSuccess) {
|
||||
WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
|
||||
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
|
||||
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
|
||||
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
|
||||
channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
//TRACE_DUMP_IPC(&info.devIpc);
|
||||
}
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||
@@ -584,7 +239,7 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
}
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
|
||||
|
||||
struct p2pRecvResources* resources;
|
||||
@@ -616,11 +271,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
if (err == hipErrorPeerAccessAlreadyEnabled) {
|
||||
hipGetLastError();
|
||||
} else if (err != hipSuccess) {
|
||||
WARN("failed to peer with device %d(=%d): %d %s",
|
||||
peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
|
||||
WARN("failed to peer with device %d(=%lx): %d %s",
|
||||
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
}
|
||||
} else {
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
@@ -629,11 +284,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
// Map IPC and enable P2P access
|
||||
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
|
||||
if (err != hipSuccess) {
|
||||
WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
|
||||
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
|
||||
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
|
||||
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
//TRACE_DUMP_IPC(&info.devIpc);
|
||||
}
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||
@@ -669,6 +324,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
|
||||
|
||||
send->conn.buff = remDevMem->buff;
|
||||
send->conn.llBuff = remDevMem->llBuff;
|
||||
send->conn.ll128Buff = remDevMem->ll128Buff;
|
||||
send->conn.tail = &remDevMem->tail;
|
||||
send->conn.opCountRem = resources->devRemOpCount;
|
||||
send->conn.head = &resources->devMem->head;
|
||||
@@ -706,6 +362,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
||||
|
||||
recv->conn.buff = resources->devMem->buff;
|
||||
recv->conn.llBuff = resources->devMem->llBuff;
|
||||
recv->conn.ll128Buff = resources->devMem->ll128Buff;
|
||||
recv->conn.tail = &resources->devMem->tail;
|
||||
recv->conn.opCountLoc = resources->devOpCount;
|
||||
recv->conn.head = &remDevMem->head;
|
||||
@@ -738,7 +395,6 @@ ncclResult_t p2pRecvFree(void* resources) {
|
||||
struct ncclTransport p2pTransport = {
|
||||
"P2P",
|
||||
p2pCanConnect,
|
||||
p2pGetRings,
|
||||
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
|
||||
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
|
||||
};
|
||||
|
||||
+18
-92
@@ -1,17 +1,11 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "transport.h"
|
||||
#include "param.h"
|
||||
#include "comm.h"
|
||||
#include "shm.h"
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
struct shmConnectInfo {
|
||||
uint64_t pidHash;
|
||||
@@ -41,98 +35,29 @@ struct shmRecvResources {
|
||||
|
||||
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
|
||||
|
||||
/* Determine if we can communicate with the peer */
|
||||
ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
  // SHM is reported as unusable (0) when it is disabled through the
  // SHM_DISABLE parameter or when the two peers have different host hashes;
  // otherwise it is reported as usable (1).
  if ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) {
    *ret = 0;
  } else {
    *ret = 1;
  }
  return ncclSuccess;
}
|
||||
/* Determine two peers can communicate with SHM */
|
||||
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 0;
|
||||
|
||||
// Return the lowest rank whose groups[] entry equals `group`, ignoring
// `rankToAvoid`. Returns -1 when no such rank exists.
static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) {
  int candidate = 0;
  while (candidate < nranks) {
    if (candidate != rankToAvoid && groups[candidate] == group) return candidate;
    candidate++;
  }
  return -1;
}
|
||||
if (ncclParamShmDisable() == 1) return ncclSuccess;
|
||||
|
||||
// Return the highest rank whose groups[] entry equals `group`, ignoring
// `rankToAvoid`. Returns -1 when no such rank exists.
static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) {
  int candidate = nranks;
  while (candidate-- > 0) {
    if (candidate != rankToAvoid && groups[candidate] == group) return candidate;
  }
  return -1;
}
|
||||
// Same host?
|
||||
TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
|
||||
#define MAXGROUPS 16
|
||||
// Common /dev/shm (between containers) ?
|
||||
TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev);
|
||||
if (info1->shmDev != info2->shmDev) return ncclSuccess;
|
||||
|
||||
*ret = 1;
|
||||
|
||||
// Link groups of ranks together through prev/next for each ring.
// For every ring and every group, a "start" rank (one whose prev entry is
// already set, else the first rank of the group) and an "end" rank (one whose
// next entry is already set, else the last rank of the group) are selected.
// The groups are then chained end -> start in increasing group order, from
// the group holding a pre-set start to the group holding a pre-set end; when
// no such groups exist the chain runs from group 0 to the last group and the
// loop is closed explicitly.
// A requested ring count of MAXCHANNELS is reduced to 1 first. If a group has
// no usable start or end, *nringsRet is set to the current ring index and the
// function returns. Always returns ncclSuccess.
// NOTE(review): starts/ends hold MAXGROUPS entries and nGroups is not checked
// against that bound here - confirm the caller guarantees it.
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
  if (*nringsRet == MAXCHANNELS) *nringsRet = 1;

  // Group ids are read from the last rank: its id + 1 is the group count.
  const int nGroups = groups[nranks-1] + 1;
  int starts[MAXGROUPS];
  int ends[MAXGROUPS];

  for (int ring = 0; ring < *nringsRet; ring++) {
    const int base = ring*nranks;
    int startGroup = -1;
    int endGroup = -1;

    for (int group = 0; group < nGroups; group++) {
      int start = -1;
      int end = -1;
      int members = 0;
      for (int rank = 0; rank < nranks; rank++) {
        if (groups[rank] != group) continue;
        members++;
        if (prev[base+rank] != -1) {
          if (start != -1) {
            WARN("Multiple starts found in group");
          }
          start = rank;
          startGroup = group;
        }
        if (next[base+rank] != -1) {
          if (end != -1) {
            WARN("Multiple ends found in group");
          }
          end = rank;
          endGroup = group;
        }
      }

      if (members == 1) {
        // A single-rank group is both its own start and end.
        start = end = groupFirst(nranks, groups, group, -1);
      } else {
        // Fill in whichever endpoint is missing, avoiding the other one.
        if (start == -1) start = groupFirst(nranks, groups, group, end);
        if (end == -1) end = groupLast(nranks, groups, group, start);
      }

      if (start == -1 || end == -1) {
        *nringsRet = ring;
        return ncclSuccess;
      }
      starts[group] = start;
      ends[group] = end;
    }

    if (endGroup == -1 || startGroup == -1) {
      startGroup = 0;
      endGroup = nGroups-1;
      // Close the loop
      next[base+ends[endGroup]] = starts[startGroup];
      prev[base+starts[startGroup]] = ends[endGroup];
    }

    // Chain the intermediate groups, skipping over the end group.
    int current = startGroup;
    for (int step = 0; step < nGroups-2; step++) {
      int following = (current+1)%nGroups;
      if (following == endGroup) following = (following+1)%nGroups;
      next[base+ends[current]] = starts[following];
      prev[base+starts[following]] = ends[current];
      current = following;
    }

    // Connect with the last
    next[base+ends[current]] = starts[endGroup];
    prev[base+starts[endGroup]] = ends[current];
  }
  return ncclSuccess;
}
|
||||
|
||||
#define MAX_SHM_NAME_LEN 1024
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
|
||||
struct shmSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
@@ -150,13 +75,13 @@ ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
||||
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
|
||||
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
struct shmRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
@@ -195,6 +120,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
||||
send->transportResources = resources;
|
||||
send->conn.buff = resources->devRemHostMem->buff;
|
||||
send->conn.llBuff = resources->devRemHostMem->llBuff;
|
||||
send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
|
||||
send->conn.tail = &resources->devRemHostMem->tail;
|
||||
send->conn.opCountRem = &resources->devRemHostMem->opCount;
|
||||
|
||||
@@ -219,6 +145,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
|
||||
|
||||
recv->conn.buff = resources->devHostMem->buff;
|
||||
recv->conn.llBuff = resources->devHostMem->llBuff;
|
||||
recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
|
||||
recv->conn.tail = &resources->devHostMem->tail;
|
||||
recv->conn.opCountLoc = &resources->devHostMem->opCount;
|
||||
return ncclSuccess;
|
||||
@@ -243,7 +170,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
|
||||
struct ncclTransport shmTransport = {
|
||||
"SHM",
|
||||
shmCanConnect,
|
||||
shmGetRings,
|
||||
{ shmSendSetup, shmSendConnect, shmSendFree, NULL },
|
||||
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
|
||||
};
|
||||
|
||||
Ссылка в новой задаче
Block a user