Merge remote-tracking branch 'nccl/master' into no-target-id
[ROCm/rccl commit: d469947641]
This commit is contained in:
@@ -89,10 +89,6 @@ set(CU_SOURCES
|
||||
src/collectives/device/broadcast.cu
|
||||
src/collectives/device/reduce_scatter.cu
|
||||
src/collectives/device/sendrecv.cu
|
||||
src/collectives/device/gather.cu
|
||||
src/collectives/device/scatter.cu
|
||||
src/collectives/device/all_to_all.cu
|
||||
src/collectives/device/all_to_allv.cu
|
||||
src/collectives/device/functions.cu)
|
||||
|
||||
set(CPP_SOURCES)
|
||||
|
||||
@@ -11,6 +11,7 @@ KEEP ?= 0
|
||||
DEBUG ?= 0
|
||||
TRACE ?= 0
|
||||
PROFAPI ?= 0
|
||||
NVTX ?= 1
|
||||
|
||||
NVCC = $(CUDA_HOME)/bin/nvcc
|
||||
|
||||
@@ -87,6 +88,10 @@ ifneq ($(TRACE), 0)
|
||||
CXXFLAGS += -DENABLE_TRACE
|
||||
endif
|
||||
|
||||
ifeq ($(NVTX), 0)
|
||||
CXXFLAGS += -DNVTX_DISABLE
|
||||
endif
|
||||
|
||||
ifneq ($(KEEP), 0)
|
||||
NVCUFLAGS += -keep
|
||||
endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 7
|
||||
NCCL_PATCH := 8
|
||||
NCCL_MINOR := 8
|
||||
NCCL_PATCH := 3
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
@@ -9,7 +9,7 @@ Package: libnccl${nccl:Major}
|
||||
Section: libs
|
||||
Architecture: ${pkg:Arch}
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
Description: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Description: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
|
||||
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
|
||||
broadcast, and reduce-scatter.
|
||||
@@ -21,7 +21,7 @@ Package: libnccl-dev
|
||||
Section: libdevel
|
||||
Architecture: ${pkg:Arch}
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
|
||||
Description: NVIDIA Collectives Communication Library (NCCL) Development Files
|
||||
Description: NVIDIA Collective Communication Library (NCCL) Development Files
|
||||
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
|
||||
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
|
||||
broadcast, and reduce-scatter.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
Name: libnccl
|
||||
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
|
||||
Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
|
||||
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
|
||||
Group: Development/Libraries
|
||||
License: BSD
|
||||
@@ -18,13 +18,13 @@ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
|
||||
sockets.
|
||||
|
||||
%package devel
|
||||
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
Group: Development/Libraries
|
||||
%description devel
|
||||
NCCL development files
|
||||
|
||||
%package static
|
||||
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
Group: Development/Libraries
|
||||
%description static
|
||||
NCCL static library
|
||||
|
||||
+261
-199
@@ -13,144 +13,77 @@
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
struct bootstrapNetComm {
|
||||
int fd;
|
||||
};
|
||||
|
||||
/* Init functions */
|
||||
static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
|
||||
static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
|
||||
static int bootstrapNetIfs = -1;
|
||||
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
|
||||
static union socketAddress bootstrapNetIfAddr;
|
||||
static int bootstrapNetInitDone = 0;
|
||||
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
ncclResult_t bootstrapNetInit() {
|
||||
if (bootstrapNetIfs == -1) {
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
pthread_mutex_lock(&bootstrapNetLock);
|
||||
if (bootstrapNetIfs == -1) {
|
||||
bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
|
||||
if (bootstrapNetIfs <= 0) {
|
||||
WARN("Bootstrap : no socket interface found");
|
||||
return ncclInternalError;
|
||||
} else {
|
||||
char line[1024];
|
||||
char addrline[1024];
|
||||
line[0] = '\0';
|
||||
for (int i=0; i<bootstrapNetIfs; i++) {
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
|
||||
socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env) {
|
||||
union socketAddress remoteAddr;
|
||||
if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
return ncclSystemError;
|
||||
}
|
||||
} else {
|
||||
int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
|
||||
if (nIfs <= 0) {
|
||||
WARN("Bootstrap : no socket interface found");
|
||||
return ncclInternalError;
|
||||
}
|
||||
line[1023] = '\0';
|
||||
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
|
||||
}
|
||||
char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
|
||||
sprintf(line, " %s:", bootstrapNetIfName);
|
||||
socketToString(&bootstrapNetIfAddr.sa, line+strlen(line));
|
||||
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
|
||||
bootstrapNetInitDone = 1;
|
||||
}
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
|
||||
NCCLCHECK(ncclCalloc(comm, 1));
|
||||
(*comm)->fd = -1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
|
||||
if (dev >= bootstrapNetIfs) return ncclInternalError;
|
||||
memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Socket Interface Selection type */
|
||||
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
|
||||
|
||||
static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
|
||||
// if dev >= 0, listen based on dev
|
||||
if (dev >= 0) {
|
||||
NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
|
||||
} else if (dev == findSubnetIf) {
|
||||
// handle stores a remote address
|
||||
// need to find a local addr that is in the same network as the remote addr
|
||||
union socketAddress localAddr;
|
||||
char ifName[MAX_IF_NAME_SIZE];
|
||||
if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
return ncclSystemError;
|
||||
}
|
||||
// pass the local address back
|
||||
memcpy(connectAddr, &localAddr, sizeof(localAddr));
|
||||
} // Otherwise, handle stores a local address
|
||||
struct bootstrapNetComm* comm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&comm));
|
||||
NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
|
||||
*listenComm = comm;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
struct bootstrapNetComm* comm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&comm));
|
||||
NCCLCHECK(connectAddress(&comm->fd, connectAddr));
|
||||
*sendComm = comm;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
|
||||
struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
|
||||
struct bootstrapNetComm* rComm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&rComm));
|
||||
static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd) {
|
||||
struct sockaddr_in sockaddr;
|
||||
socklen_t socklen = sizeof(struct sockaddr_in);
|
||||
SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
|
||||
*recvComm = rComm;
|
||||
SYSCHECKVAL(accept(listenFd, (struct sockaddr*)&sockaddr, &socklen), "accept", *recvFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetClose(void* opaqueComm) {
|
||||
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
|
||||
if (comm) {
|
||||
close(comm->fd);
|
||||
free(comm);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
|
||||
|
||||
// Additional sync functions
|
||||
static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
|
||||
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
|
||||
NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
|
||||
NCCLCHECK(socketSend(comm->fd, data, size));
|
||||
static ncclResult_t bootstrapNetSend(int fd, void* data, int size) {
|
||||
NCCLCHECK(socketSend(fd, &size, sizeof(int)));
|
||||
NCCLCHECK(socketSend(fd, data, size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
|
||||
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
|
||||
static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
|
||||
int recvSize;
|
||||
NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
|
||||
NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
|
||||
if (recvSize > size) {
|
||||
WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
|
||||
NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct extInfo {
|
||||
int rank;
|
||||
int nranks;
|
||||
ncclNetHandle_t extHandleListenRoot;
|
||||
ncclNetHandle_t extHandleListen;
|
||||
union socketAddress extAddressListenRoot;
|
||||
union socketAddress extAddressListen;
|
||||
};
|
||||
|
||||
#include <sys/resource.h>
|
||||
@@ -163,27 +96,29 @@ static ncclResult_t setFilesLimit() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void *bootstrapRoot(void* listenComm) {
|
||||
static void *bootstrapRoot(void* args) {
|
||||
int listenFd = (uint64_t)args;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
int nranks = 0, c = 0;
|
||||
struct extInfo info;
|
||||
ncclNetHandle_t *rankHandles = NULL;
|
||||
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
|
||||
ncclNetHandle_t zero = { 0 }; // for sanity checking
|
||||
void* tmpComm;
|
||||
ncclResult_t res;
|
||||
union socketAddress *rankAddresses = NULL;
|
||||
union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
|
||||
union socketAddress *zero = NULL;
|
||||
NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
|
||||
setFilesLimit();
|
||||
|
||||
TRACE(NCCL_INIT, "BEGIN");
|
||||
/* Receive addresses from all ranks */
|
||||
int nranks = 0, c = 0;
|
||||
do {
|
||||
NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
|
||||
int tmpFd;
|
||||
NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &info, sizeof(info)), res, out);
|
||||
close(tmpFd);
|
||||
|
||||
if (c == 0) {
|
||||
nranks = info.nranks;
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out);
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out);
|
||||
}
|
||||
|
||||
if (nranks != info.nranks) {
|
||||
@@ -191,14 +126,14 @@ static void *bootstrapRoot(void* listenComm) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
|
||||
if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
|
||||
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Save the connection handle for that rank
|
||||
memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
|
||||
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
|
||||
memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
|
||||
memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
|
||||
|
||||
++c;
|
||||
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
|
||||
@@ -208,44 +143,46 @@ static void *bootstrapRoot(void* listenComm) {
|
||||
// Send the connect handle for the next rank in the AllGather ring
|
||||
for (int r=0; r<nranks; ++r) {
|
||||
int next = (r+1) % nranks;
|
||||
void *tmpSendComm;
|
||||
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
|
||||
int tmpSendFd;
|
||||
NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddresses+next, sizeof(union socketAddress)), res, out);
|
||||
close(tmpSendFd);
|
||||
}
|
||||
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
|
||||
|
||||
out:
|
||||
bootstrapNetCloseListen(listenComm);
|
||||
if (rankHandles) free(rankHandles);
|
||||
if (rankHandlesRoot) free(rankHandlesRoot);
|
||||
close(listenFd);
|
||||
if (rankAddresses) free(rankAddresses);
|
||||
if (rankAddressesRoot) free(rankAddressesRoot);
|
||||
if (zero) free(zero);
|
||||
|
||||
TRACE(NCCL_INIT, "DONE");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
void* listenComm;
|
||||
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
|
||||
union socketAddress* connectAddr = (union socketAddress*) id;
|
||||
int listenFd;
|
||||
NCCLCHECK(createListenSocket(&listenFd, connectAddr));
|
||||
pthread_t thread;
|
||||
pthread_create(&thread, NULL, bootstrapRoot, listenComm);
|
||||
pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
|
||||
static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
|
||||
static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
|
||||
memset(id, 0, sizeof(ncclUniqueId));
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
union socketAddress* connectAddr = (union socketAddress*) id;
|
||||
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
||||
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
|
||||
if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
} else {
|
||||
memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
NCCLCHECK(bootstrapCreateRoot(id, false));
|
||||
}
|
||||
|
||||
@@ -254,24 +191,135 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
|
||||
|
||||
struct unexConn {
|
||||
int peer;
|
||||
void* comm;
|
||||
int fd;
|
||||
struct unexConn* next;
|
||||
};
|
||||
|
||||
struct extState {
|
||||
void* extBstrapListenComm;
|
||||
void* extBstrapRingRecvComm;
|
||||
void* extBstrapRingSendComm;
|
||||
ncclNetHandle_t* peerBstrapHandles;
|
||||
struct unexConn* unexpectedConnections;
|
||||
int rank;
|
||||
int nranks;
|
||||
int dev;
|
||||
// Remote allocator state
|
||||
struct remAllocState {
|
||||
int cudaDev;
|
||||
int listenFd;
|
||||
int stop;
|
||||
};
|
||||
|
||||
struct extState {
|
||||
int extListenFd;
|
||||
int extRingRecvFd;
|
||||
int extRingSendFd;
|
||||
union socketAddress* peerCommAddresses;
|
||||
union socketAddress* peerAllocAddresses;
|
||||
struct unexConn* unexpectedConnections;
|
||||
int cudaDev;
|
||||
int rank;
|
||||
int nranks;
|
||||
|
||||
// Intermediate memory allocation service
|
||||
struct remAllocState* allocState;
|
||||
pthread_t allocThread;
|
||||
};
|
||||
|
||||
#define MAX_SEGMENTS 128
|
||||
|
||||
static ncclResult_t remoteAlloc(void** ptr, int fd) {
|
||||
size_t size;
|
||||
NCCLCHECK(socketRecv(fd, &size, sizeof(size_t)));
|
||||
hipIpcMemHandle_t devIpc;
|
||||
NCCLCHECK(ncclCudaCalloc((char**)ptr, size, true));
|
||||
hipError_t res = hipIpcGetMemHandle(&devIpc, *ptr);
|
||||
if (res != hipSuccess) {
|
||||
WARN("[Rem Allocator] hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
|
||||
hipFree(*ptr);
|
||||
CUDACHECK(res);
|
||||
}
|
||||
// The CUDA IPC
|
||||
NCCLCHECK(socketSend(fd, &devIpc, sizeof(hipIpcMemHandle_t)));
|
||||
// And the direct pointer
|
||||
NCCLCHECK(socketSend(fd, ptr, sizeof(void*)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#include <poll.h>
|
||||
|
||||
// Service thread to allocate memory for other GPUs, used as intermediate step.
|
||||
void* ncclRemoteMemAllocationService(void* args) {
|
||||
struct remAllocState* state = (struct remAllocState *) args;
|
||||
if (hipSetDevice(state->cudaDev) != hipSuccess) {
|
||||
WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
|
||||
}
|
||||
|
||||
// Prepare poll descriptor
|
||||
void* segments[MAX_SEGMENTS];
|
||||
struct pollfd pollfds[MAX_SEGMENTS+1];
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) {
|
||||
pollfds[s].fd = -1;
|
||||
pollfds[s].events = POLLHUP;
|
||||
}
|
||||
pollfds[MAX_SEGMENTS].fd = state->listenFd;
|
||||
pollfds[MAX_SEGMENTS].events = POLLIN;
|
||||
|
||||
int nbuffers = 0;
|
||||
while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
|
||||
if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
|
||||
WARN("[Rem Allocator] Poll failed with error %d", error);
|
||||
return NULL;
|
||||
}
|
||||
if (pollfds[MAX_SEGMENTS].revents) {
|
||||
int s = 0;
|
||||
while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
|
||||
if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd) != ncclSuccess) {
|
||||
pollfds[s].fd = -1;
|
||||
} else {
|
||||
if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd) != ncclSuccess)) {
|
||||
WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
|
||||
close(pollfds[s].fd);
|
||||
pollfds[s].fd = -1;
|
||||
} else {
|
||||
nbuffers++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) {
|
||||
if (pollfds[s].revents & POLLHUP) {
|
||||
if (hipFree(segments[s]) != hipSuccess) {
|
||||
WARN("[Rem Allocator] hipFree %p failed", segments[s]);
|
||||
}
|
||||
segments[s] = NULL;
|
||||
close(pollfds[s].fd);
|
||||
pollfds[s].fd = -1;
|
||||
nbuffers--;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) {
|
||||
if (segments[s]) hipFree(segments[s]);
|
||||
close(pollfds[s].fd);
|
||||
}
|
||||
close(state->listenFd);
|
||||
free(state);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
int fd;
|
||||
ncclResult_t res;
|
||||
*id = -1;
|
||||
NCCLCHECK(connectAddress(&fd, state->peerAllocAddresses+rank));
|
||||
NCCLCHECKGOTO(socketSend(fd, &size, sizeof(size_t)), res, end);
|
||||
NCCLCHECKGOTO(socketRecv(fd, ipc, sizeof(hipIpcMemHandle_t)), res, end);
|
||||
NCCLCHECKGOTO(socketRecv(fd, ptr, sizeof(void*)), res, end);
|
||||
*id = fd;
|
||||
end:
|
||||
return res;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
|
||||
SYSCHECK(close(id), "close");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
|
||||
struct extState* state;
|
||||
NCCLCHECK(ncclCalloc(&state, 1));
|
||||
state->rank = rank;
|
||||
@@ -283,19 +331,15 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
|
||||
struct extInfo info = { 0 };
|
||||
info.rank = rank;
|
||||
info.nranks = nranks;
|
||||
void *tmpSendComm, *tmpRecvComm;
|
||||
// Pass the remote address to listen via info
|
||||
if (idFromEnv) {
|
||||
memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
|
||||
memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
|
||||
}
|
||||
// listen will return the local address via info (specify interface type 'findSubnetIf')
|
||||
state->dev = idFromEnv ? findSubnetIf : 0;
|
||||
void* extBstrapListenCommRoot;
|
||||
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
|
||||
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
|
||||
int tmpSendFd, tmpRecvFd;
|
||||
|
||||
// stagger connection times to avoid an overload of the root at very high rank counts
|
||||
int extListenFdRoot;
|
||||
memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
|
||||
NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
|
||||
|
||||
// stagger connection times to avoid an overload of the root
|
||||
if (nranks > 128) {
|
||||
long msec = rank;
|
||||
struct timespec tv;
|
||||
@@ -306,25 +350,35 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
|
||||
}
|
||||
|
||||
// send info on my listening socket to root
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
|
||||
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
|
||||
union socketAddress* rootAddr = (union socketAddress*)id;
|
||||
NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendFd, &info, sizeof(info)));
|
||||
close(tmpSendFd);
|
||||
|
||||
// get info on my "next" rank in the bootstrap ring from root
|
||||
ncclNetHandle_t extHandleNext;
|
||||
NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
|
||||
union socketAddress extAddressNext;
|
||||
NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &extAddressNext, sizeof(extAddressNext)));
|
||||
close(tmpRecvFd);
|
||||
close(extListenFdRoot);
|
||||
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
|
||||
NCCLCHECK(connectAddress(&state->extRingSendFd, &extAddressNext));
|
||||
// Accept the connect request from the previous rank in the AllGather ring
|
||||
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
|
||||
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd));
|
||||
|
||||
// AllGather all listen handlers
|
||||
NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
|
||||
memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
|
||||
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
|
||||
memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
|
||||
|
||||
// Create the memory allocation service
|
||||
NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
|
||||
memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
NCCLCHECK(ncclCalloc(&state->allocState, 1));
|
||||
CUDACHECK(hipGetDevice(&state->allocState->cudaDev));
|
||||
NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
|
||||
pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
||||
|
||||
@@ -348,9 +402,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
size_t sslice = (rank - i + nranks) % nranks;
|
||||
|
||||
// Send slice to the right
|
||||
NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
|
||||
NCCLCHECK(bootstrapNetSend(state->extRingSendFd, data+sslice*size, size));
|
||||
// Recv slice from the left
|
||||
NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
|
||||
NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, data+rslice*size, size));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
@@ -359,20 +413,20 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
void* tmpSendComm;
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
|
||||
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
|
||||
int tmpSendFd;
|
||||
NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendFd, data, size));
|
||||
close(tmpSendFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
|
||||
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
|
||||
// New unex
|
||||
struct unexConn* unex;
|
||||
NCCLCHECK(ncclCalloc(&unex, 1));
|
||||
unex->peer = peer;
|
||||
unex->comm = comm;
|
||||
unex->fd = fd;
|
||||
|
||||
// Enqueue
|
||||
struct unexConn* list = state->unexpectedConnections;
|
||||
@@ -385,7 +439,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* unexpectedDequeue(struct extState* state, int peer) {
|
||||
int unexpectedDequeue(struct extState* state, int peer) {
|
||||
struct unexConn* elem = state->unexpectedConnections;
|
||||
struct unexConn* prev = NULL;
|
||||
while (elem) {
|
||||
@@ -395,41 +449,41 @@ void* unexpectedDequeue(struct extState* state, int peer) {
|
||||
} else {
|
||||
prev->next = elem->next;
|
||||
}
|
||||
void* comm = elem->comm;
|
||||
int fd = elem->fd;
|
||||
free(elem);
|
||||
return comm;
|
||||
return fd;
|
||||
}
|
||||
prev = elem;
|
||||
elem = elem->next;
|
||||
}
|
||||
return NULL;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// We can't know who we'll receive from, so we need to receive everything at once
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
|
||||
void* tmpRecvComm;
|
||||
int tmpRecvFd;
|
||||
|
||||
// Search unexpected connections first
|
||||
if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
|
||||
close(tmpRecvFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Then look for new connections
|
||||
while (1) {
|
||||
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd));
|
||||
int newPeer;
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int)));
|
||||
if (newPeer == peer) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
|
||||
close(tmpRecvFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Unexpected connection. Save for later.
|
||||
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
|
||||
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -439,11 +493,17 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
WARN("Unexpected connections are not empty.\n");
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
|
||||
NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
|
||||
close(state->extListenFd);
|
||||
close(state->extRingSendFd);
|
||||
close(state->extRingRecvFd);
|
||||
|
||||
free(state->peerBstrapHandles);
|
||||
state->allocState->stop = 1;
|
||||
|
||||
// Join the allocThread so we catch resource leaks as being hung here
|
||||
// pthread_join(state->allocThread, nullptr);
|
||||
|
||||
free(state->peerCommAddresses);
|
||||
free(state->peerAllocAddresses);
|
||||
free(state);
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -451,10 +511,12 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
|
||||
ncclResult_t bootstrapAbort(void* commState) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
bootstrapNetCloseListen(state->extBstrapListenComm);
|
||||
bootstrapNetCloseSend(state->extBstrapRingSendComm);
|
||||
bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
|
||||
free(state->peerBstrapHandles);
|
||||
close(state->extListenFd);
|
||||
close(state->extRingSendFd);
|
||||
close(state->extRingRecvFd);
|
||||
state->allocState->stop = 2;
|
||||
free(state->peerCommAddresses);
|
||||
free(state->peerAllocAddresses);
|
||||
free(state);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -26,16 +26,14 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
|
||||
}
|
||||
|
||||
// Per-channel operation list.
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->collectivesExtra, comm->nRanks*NCCL_MAX_OPS*4));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
||||
if (channel->id == -1) return ncclSuccess;
|
||||
// Operation list
|
||||
NCCLCHECK(ncclCudaHostFree(channel->collectivesExtra));
|
||||
NCCLCHECK(ncclCudaHostFree(channel->collectives));
|
||||
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
|
||||
|
||||
// Free Ring index to rank tables
|
||||
free(channel->ring.userRanks);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollAllGather, "AllGather",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
|
||||
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
||||
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -25,9 +25,10 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
//struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
|
||||
// sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
// ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
|
||||
//return ncclEnqueueCheck(&info);
|
||||
return ncclInternalError;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,9 +37,10 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
|
||||
sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
|
||||
return ncclEnqueueCheck(&info);
|
||||
//struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
|
||||
// sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
// ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
|
||||
//return ncclEnqueueCheck(&info);
|
||||
return ncclInternalError;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
|
||||
IMPL_COLL_C(AllGather);
|
||||
|
||||
@@ -9,206 +9,201 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
prims.directSend(thisInput+chunkOffset, offset, nelem);
|
||||
} else {
|
||||
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
prims.directSend(thisInput+chunkOffset, offset, nelem);
|
||||
} else {
|
||||
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Final wait/copy.
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Final wait/copy.
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
template<int PROTO, class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_TREE, PROTO, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
template<int PROTO, class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
|
||||
IMPL_COLL_R(AllReduce);
|
||||
|
||||
Plik diff jest za duży
Load Diff
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
|
||||
IMPL_COLL_C(Broadcast);
|
||||
|
||||
@@ -9,177 +9,155 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws;
|
||||
if (tid == 0) clk = __rtc64();
|
||||
#endif
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
INIT_COUNTER;
|
||||
prims.send(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(copySend);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
} else {
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
INIT_COUNTER;
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recv);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvCopySend);
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
@@ -11,122 +11,95 @@
|
||||
#include "collectives.h"
|
||||
#include "devcomm.h"
|
||||
|
||||
// Device-side 64-bit timestamp read.
// When __HIP__ is set the value comes from __builtin_amdgcn_s_memrealtime();
// otherwise it comes from __clock_u64(). Either way the result is cast to
// long long int before being returned.
__device__
inline __attribute((always_inline))
long long int __rtc64() {
  long long int ticks;
#if __HIP__
  ticks = (long long int) __builtin_amdgcn_s_memrealtime();
#else
  ticks = (long long int) __clock_u64();
#endif
  return ticks;
}
|
||||
#define COLL_UNROLL 2
|
||||
#define NCCL_MAX_DEV_ARITY NCCL_MAX_TREE_ARITY
|
||||
|
||||
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
|
||||
// Each thread sets a predicate to true if abort == 1
|
||||
// all CTA's threads enter the barrier and do a popc on their predicates being True
|
||||
// If any of the thread's predicate was True, all the threads call exit()
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
/* Abort barrier for the HIP build.
 *
 *   abort      - nonzero on a thread that observed the abort flag.
 *   abortCount - pointer to a counter shared by the threads that take part;
 *                it is bumped atomically by every thread whose `abort` is set.
 *
 * After the increment all threads meet at __syncthreads(); if the counter is
 * nonzero afterwards, the macro executes `return false;` in the ENCLOSING
 * function, so it may only be used inside a function whose return type
 * accepts `false`.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: it can be used as the body of an unbraced if/else and
 * always requires the trailing semicolon at the call site.
 */
#define exitIfAbortBarrier(abort, abortCount) \
  do { \
    if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
    __syncthreads(); \
    if (LOAD(abortCount)) { /*asm volatile ("s_endpgm");*/ return false; } \
  } while (0)
|
||||
#define __syncwarp()
|
||||
#else
|
||||
// PTX (non-HIP) implementation of the abort barrier.
// Each calling thread contributes a predicate that is true only when
// `abort` equals exactly 1; the threads then meet at named barrier 13, which
// counts the true predicates. Any thread that reads back a nonzero count
// executes the PTX `exit` instruction, so the function does not return on
// that path.
static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
uint32_t popc;
|
||||
// Open a PTX scope so the predicate register declared below stays local.
asm ("{");
|
||||
asm volatile (" .reg .pred barr_pred;");
|
||||
// barr_pred = (abort == 1)
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
|
||||
// Barrier 13 with a population-count reduction over barr_pred; the count
// is written to popc.
asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
|
||||
asm ("}");
|
||||
// At least one participating thread requested an abort: terminate this thread.
if (popc) { asm volatile ("exit;"); }
|
||||
}
|
||||
#endif
|
||||
|
||||
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll, op, dtype)
|
||||
#define NCCL_FUNC5(func, algo, redop, type) \
|
||||
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##CollNet, op, dtype)
|
||||
#define NCCL_FUNC4(func, redop, type) \
|
||||
NCCL_FUNC5(func, TREE, redop, type), \
|
||||
NCCL_FUNC5(func, RING, redop, type), \
|
||||
NCCL_FUNC5(func, COLLNET, redop, type)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64), \
|
||||
NCCL_FUNC4(coll, op, b16)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
#define NCCL_FUNCS3A(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, uint8_t), \
|
||||
NCCL_FUNC4(func, redop, int32_t), \
|
||||
NCCL_FUNC4(func, redop, uint32_t), \
|
||||
NCCL_FUNC4(func, redop, int64_t), \
|
||||
NCCL_FUNC4(func, redop, uint64_t), \
|
||||
NCCL_FUNC4(func, redop, half), \
|
||||
NCCL_FUNC4(func, redop, float), \
|
||||
NCCL_FUNC4(func, redop, double), \
|
||||
NCCL_FUNC4(func, redop, rccl_bfloat16)
|
||||
#define NCCL_FUNCS3B(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t)
|
||||
|
||||
// Must be consistent with ncclRedOp_t
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum ), \
|
||||
NCCL_FUNCS3A(coll, prod), \
|
||||
NCCL_FUNCS3A(coll, max ), \
|
||||
NCCL_FUNCS3A(coll, min )
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
#define NCCL_FUNCS2A(func) \
|
||||
NCCL_FUNCS3A(func, Sum ), \
|
||||
NCCL_FUNCS3A(func, Prod), \
|
||||
NCCL_FUNCS3A(func, Max ), \
|
||||
NCCL_FUNCS3A(func, Min )
|
||||
#define NCCL_FUNCS2B(func) \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum)
|
||||
|
||||
// Must be consistent with ncclFunc_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2A(ncclAllReduce), \
|
||||
NCCL_COLL_NAME(ncclGather, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclScatter, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclAllToAll, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclAllToAllv, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8) }
|
||||
NCCL_FUNCS2B(Broadcast), \
|
||||
NCCL_FUNCS2A(Reduce), \
|
||||
NCCL_FUNCS2B(AllGather), \
|
||||
NCCL_FUNCS2A(ReduceScatter), \
|
||||
NCCL_FUNCS2A(AllReduce), \
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
using ncclKernelFunc_t = void (*)(struct CollectiveArgs*);
|
||||
using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);
|
||||
|
||||
static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce),
|
||||
NCCL_COLL_NAME(ncclGather, copy, i8),
|
||||
NCCL_COLL_NAME(ncclScatter, copy, i8),
|
||||
NCCL_COLL_NAME(ncclAllToAll, copy, i8),
|
||||
NCCL_COLL_NAME(ncclAllToAllv, copy, i8),
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8)
|
||||
NCCL_FUNCS2B(Broadcast),
|
||||
NCCL_FUNCS2A(Reduce),
|
||||
NCCL_FUNCS2B(AllGather),
|
||||
NCCL_FUNCS2A(ReduceScatter),
|
||||
NCCL_FUNCS2A(AllReduce),
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
#endif
|
||||
};
|
||||
|
||||
template<unsigned short f, unsigned short l>
|
||||
struct Caller {
|
||||
static __device__ __host__
|
||||
void call(ncclColl* const c) noexcept
|
||||
void call(struct ncclWorkElem* const c) noexcept
|
||||
{
|
||||
constexpr unsigned short m = f + (l - f) / 2;
|
||||
|
||||
@@ -137,78 +110,72 @@ struct Caller {
|
||||
template<unsigned short f>
|
||||
struct Caller<f, f + 1>{
|
||||
static __device__ __host__
|
||||
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
|
||||
void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); }
|
||||
};
|
||||
|
||||
inline
|
||||
__device__
|
||||
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
|
||||
void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
|
||||
if (c->funcIndex < 360) {
|
||||
if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
|
||||
else ncclBroadcastCollNet_copy_i8(&c->args);
|
||||
if (c->funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
|
||||
else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(c);
|
||||
}
|
||||
else if (c->funcIndex < 720) Caller<360, 720>::call(c);
|
||||
else if (c->funcIndex < 1080) {
|
||||
if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
|
||||
else ncclAllGatherCollNet_copy_i8(&c->args);
|
||||
if (c->funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
|
||||
else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
|
||||
}
|
||||
else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
|
||||
else if (c->funcIndex == 1800) {
|
||||
ncclGather_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex == 1801) {
|
||||
ncclScatter_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex == 1802) {
|
||||
ncclAllToAll_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex == 1803) {
|
||||
ncclAllToAllv_copy_i8(&c->args);
|
||||
}
|
||||
else ncclSendRecv_copy_i8(&c->args);
|
||||
else ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
|
||||
}
|
||||
|
||||
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
|
||||
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
|
||||
int* d = (int*)dst;
|
||||
int* s = (int*)src;
|
||||
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
|
||||
}
|
||||
|
||||
static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
|
||||
// Check whether the last operation was aborted and make sure all threads exit
|
||||
int abort = tid == 0 ? *(comm->abortFlag) : 0;
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
|
||||
static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
|
||||
__syncthreads();
|
||||
if (tid == 0) hostColl->active = 0;
|
||||
load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
|
||||
// Check whether the last operation was aborted and make sure all threads exit
|
||||
int abort = tid == 0 ? LOAD(comm->abortFlag) : 0;
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
if (tid == 0) hostWork->elems[0].active = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction {
|
||||
public:
|
||||
__device__ void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define traceColl(fIdx) \
|
||||
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
comm->collTrace[pos].timeStamp = __rtc64(); \
|
||||
comm->collTrace[pos].opCount = localColl.args.opCount; \
|
||||
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
comm->collTrace[pos].opCount = w->opCount; \
|
||||
comm->collTrace[pos].bid = bid; \
|
||||
comm->collTrace[pos].funcIndex = fIdx;
|
||||
#define traceKernelLaunch(fIdx) { \
|
||||
traceColl(fIdx); \
|
||||
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (comm->collTrace[pos].data_0)); \
|
||||
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
|
||||
}
|
||||
#define traceCollEnd(fIdx) { \
|
||||
traceColl(fIdx); \
|
||||
@@ -218,124 +185,159 @@ static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* ho
|
||||
traceColl(fIdx); \
|
||||
comm->collTrace[pos].type = ncclCollTraceAbortType; \
|
||||
}
|
||||
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
comm->collTrace[pos].bid = blockIdx.x; \
|
||||
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
comm->collTrace[pos].funcIndex = data2; \
|
||||
comm->collTrace[pos].data_0 = data4; \
|
||||
comm->collTrace[pos].opCount = data8_0; \
|
||||
comm->collTrace[pos].data_1 = data8_1; \
|
||||
comm->collTrace[pos].type = ncclCollTraceDataType; \
|
||||
}
|
||||
#else
|
||||
#define traceKernelLaunch()
|
||||
#define traceCollEnd()
|
||||
#define traceAbort()
|
||||
#define traceKernelLaunch(fIdx)
|
||||
#define traceCollEnd(fIdx)
|
||||
#define traceAbort(fIdx)
|
||||
#define traceData(data2, data4, data8_0, data8_1)
|
||||
#endif
|
||||
|
||||
extern __device__ volatile uint64_t* ncclShmem;
|
||||
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
||||
|
||||
// Per-group scratch record; ncclShmemData holds NCCL_MAX_GROUPS of these in
// its `ptrs` array, and ncclKernel zeroes the barrier fields of each one
// before any work is run.
struct ncclShmemPtrs {
|
||||
// NCCL_MAX_DEV_ARITY+1 pointer slots. NOTE(review): presumably the source
// buffers handed to the copy/reduce primitives - confirm against the users.
void* srcs[NCCL_MAX_DEV_ARITY+1];
|
||||
// NCCL_MAX_DEV_ARITY+1 pointer slots. NOTE(review): presumably the matching
// destination buffers - confirm against the users.
void* dsts[NCCL_MAX_DEV_ARITY+1];
|
||||
// Barrier word, reset to 0 by thread 0 at kernel start.
uint64_t barrier;
|
||||
// One barrier word per warp (MAXWARPS entries), also reset to 0 at kernel start.
uint64_t barrier_next[MAXWARPS];
|
||||
};
|
||||
|
||||
struct ncclShmemData {
|
||||
union {
|
||||
#ifdef ENABLE_LL128
|
||||
#define ALLOCATE_SHMEM \
|
||||
__shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
|
||||
ncclShmem = shmem; \
|
||||
__shared__ uint32_t sync[NCCL_LL128_MAX_NTHREADS/WARP_SIZE];
|
||||
volatile uint64_t data[NCCL_LL128_SHMEM_SIZE];
|
||||
#else
|
||||
#define ALLOCATE_SHMEM \
|
||||
uint32_t* sync = 0;
|
||||
volatile uint64_t* data;
|
||||
#endif
|
||||
struct ncclShmemPtrs ptrs[NCCL_MAX_GROUPS];
|
||||
};
|
||||
uint32_t sync[MAXWARPS];
|
||||
struct ncclWork localWork;
|
||||
};
|
||||
|
||||
/* Functions for aggregation case */
|
||||
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
|
||||
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
|
||||
extern __device__ struct ncclShmemData *ncclShmem;
|
||||
// Per-block work loop shared by every generated kernel.
//
// Template parameters:
//   FUNCTION, ALGO, PROTO, REDOP, T, UNROLL - select the ncclFunction
//       instantiation that this kernel can run directly (`f` below).
//   FINDEX    - the funcIndex value handled by `f`; any other funcIndex is
//       routed through NCCL_CALL_FUNCTIONS.
//   COLLTRACE - when false, every trace call below is compiled out of the
//       condition.
//
// Parameter:
//   first - work element passed by value at launch. It supplies the
//       communicator and the starting FIFO index; block 0 additionally runs
//       it as its first work item unless it is a P2P element.
//
// The loop ends when a work element with active == 2 has been processed, or
// when load_coll() reports failure.
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL, int FINDEX, bool COLLTRACE>
|
||||
__device__ void ncclKernel(struct ncclWorkElem first) {
|
||||
int tid = threadIdx.x;
|
||||
int bid = blockIdx.x;
|
||||
// Block-shared scratch; published through the ncclShmem device pointer.
__shared__ struct ncclShmemData shmem;
|
||||
ncclShmem = &shmem;
|
||||
__shared__ uint32_t abortCount;
|
||||
// Thread 0 clears the abort counter and every group's barrier words; the
// __syncthreads() below orders those writes before any other thread goes on.
if (tid == 0) {
|
||||
abortCount = 0;
|
||||
for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
|
||||
shmem.ptrs[i].barrier = 0;
|
||||
for (auto j = 0; j < MAXWARPS; j++) shmem.ptrs[i].barrier_next[j] = 0;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
auto f = ncclFunction<FUNCTION, ALGO, PROTO, REDOP, T, UNROLL>();
|
||||
|
||||
struct ncclDevComm* comm = first.comm;
|
||||
struct ncclChannel* channel = comm->channels+bid;
|
||||
// w == NULL means "fetch the next element from this channel's work FIFO".
struct ncclWorkElem* w = NULL;
|
||||
uint16_t index = first.index;
|
||||
// Thread-local flag; it is only updated on the COLLTRACE && tid == 0 paths.
bool firstLaunch = true;
|
||||
|
||||
// Only block 0 consumes the by-value element, and only when it is not P2P;
// every other case starts by loading workFifo[index].
if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
|
||||
|
||||
while (1) {
|
||||
if (w == NULL) {
|
||||
w = shmem.localWork.elems;
|
||||
// Copy the next FIFO entry into shared memory; a false result stops the
// kernel (traced as an abort when COLLTRACE is on).
if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) {
|
||||
if (COLLTRACE && tid == 0) traceAbort(-1);
|
||||
return;
|
||||
}
|
||||
// The first element loaded by a block is traced as a kernel launch; later
// ones are traced with traceCollEnd.
if (COLLTRACE && tid == 0) {
|
||||
if (firstLaunch) traceKernelLaunch(w->funcIndex);
|
||||
if (!firstLaunch) traceCollEnd(w->funcIndex);
|
||||
firstLaunch = false;
|
||||
}
|
||||
} else {
|
||||
// Reached only on the first iteration, when w points at `first`.
if (COLLTRACE && tid == 0) {
|
||||
traceKernelLaunch(w->funcIndex);
|
||||
firstLaunch = false;
|
||||
}
|
||||
}
|
||||
// Only the first w->nThreads threads of the block execute the work element.
if (tid < w->nThreads) {
|
||||
if (w->funcIndex == FINDEX) {
|
||||
f.run(w);
|
||||
} else {
|
||||
NCCL_CALL_FUNCTIONS(w);
|
||||
}
|
||||
}
|
||||
// Advance to the next FIFO slot, wrapping at NCCL_MAX_OPS.
index = (index+1) % NCCL_MAX_OPS;
|
||||
// NOTE(review): active == 2 appears to mark the final element of this
// launch - confirm against the host-side enqueue code.
if (w->active == 2) {
|
||||
if (COLLTRACE && tid == 0) traceCollEnd(-1);
|
||||
return;
|
||||
}
|
||||
w = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* Kernels with the first operation inlined */
|
||||
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
|
||||
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
|
||||
int tid = threadIdx.x; \
|
||||
int bid = blockIdx.x; \
|
||||
ALLOCATE_SHMEM; \
|
||||
__shared__ struct ncclColl localColl; \
|
||||
__shared__ uint32_t abortCount; \
|
||||
__shared__ uint64_t barrier[MAXBARRIERS]; \
|
||||
__shared__ uint64_t barrier_next[MAXBARRIERS*MAXWARPS]; \
|
||||
if (tid == 0) abortCount = 0; \
|
||||
__syncthreads(); \
|
||||
\
|
||||
struct ncclChannel* channel = comm->channels+bid; \
|
||||
if (tid == 0) { \
|
||||
channel->sync = sync; \
|
||||
channel->barrier = barrier; \
|
||||
channel->barrier_next = barrier_next; \
|
||||
for (auto i = 0; i < MAXBARRIERS; i++) barrier[i] = 0; \
|
||||
for (auto i = 0; i < MAXBARRIERS*MAXWARPS; i++) barrier_next[i] = 0; \
|
||||
} \
|
||||
if (!load_coll(&localColl, channel->collectives+channel->collFifoHead, tid, comm, &abortCount)) { \
|
||||
if (tid == 0) traceAbort(-1); \
|
||||
return; \
|
||||
} \
|
||||
if (tid == 0) traceKernelLaunch(localColl.funcIndex); \
|
||||
while (1) { \
|
||||
if (tid < localColl.args.common.nThreads) { \
|
||||
if (localColl.funcIndex == fIndex) { \
|
||||
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&localColl.args); \
|
||||
} else { \
|
||||
NCCL_CALL_FUNCTIONS(&localColl); \
|
||||
} \
|
||||
} \
|
||||
int nextIndex = localColl.nextIndex; \
|
||||
if (tid == 0) channel->collFifoHead = nextIndex; \
|
||||
\
|
||||
if (localColl.active == 2) { \
|
||||
if (tid == 0) traceCollEnd(-1); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* Load next collective operation*/ \
|
||||
if (!load_coll(&localColl, channel->collectives+nextIndex, tid, comm, &abortCount)) { \
|
||||
if (tid == 0) traceAbort(-1); \
|
||||
break; \
|
||||
} \
|
||||
if (tid == 0) traceCollEnd(localColl.funcIndex); \
|
||||
} \
|
||||
__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first) { \
|
||||
if (first.comm->collTraceThread) \
|
||||
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, true>(first); \
|
||||
else \
|
||||
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, false>(first); \
|
||||
}
|
||||
|
||||
#define IMPL_COLL_KERN_sum(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_copy(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_prod(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_min(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_max(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
// Examples : AllReduce, RING, LL, Sum, uint8
|
||||
/* Functions for aggregation case */
|
||||
#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \
|
||||
__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args) { \
|
||||
auto f = ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL>(); \
|
||||
f.run(args); \
|
||||
}
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
|
||||
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
#define IMPL_COLL4(func, algo, redop, type, ncclType) \
|
||||
IMPL_COLL_FUNC(func, algo, LL, redop, type) \
|
||||
IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type) \
|
||||
|
||||
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
|
||||
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
|
||||
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
|
||||
IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
|
||||
#define IMPL_COLL3(func, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, TREE, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, RING, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, COLLNET, redop, type, ncclType)
|
||||
|
||||
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, b16, rccl_bfloat16, ncclColl, ncclOp, ncclBfloat16)
|
||||
#define IMPL_COLL2(func, redop) \
|
||||
IMPL_COLL3(func, redop, int8_t, ncclInt8) \
|
||||
IMPL_COLL3(func, redop, uint8_t, ncclUint8) \
|
||||
IMPL_COLL3(func, redop, int32_t, ncclInt32) \
|
||||
IMPL_COLL3(func, redop, uint32_t, ncclUint32) \
|
||||
IMPL_COLL3(func, redop, int64_t, ncclInt64) \
|
||||
IMPL_COLL3(func, redop, uint64_t, ncclUint64) \
|
||||
IMPL_COLL3(func, redop, half, ncclFloat16) \
|
||||
IMPL_COLL3(func, redop, float, ncclFloat32) \
|
||||
IMPL_COLL3(func, redop, double, ncclFloat64) \
|
||||
IMPL_COLL3(func, redop, rccl_bfloat16, ncclBfloat16)
|
||||
|
||||
// Reduction define all functions
|
||||
#define IMPL_COLL_R(collf, colln) \
|
||||
IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); \
|
||||
IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); \
|
||||
IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); \
|
||||
IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
|
||||
#define IMPL_COLL_R(func) \
|
||||
IMPL_COLL2(func, Sum) \
|
||||
IMPL_COLL2(func, Prod) \
|
||||
IMPL_COLL2(func, Min) \
|
||||
IMPL_COLL2(func, Max)
|
||||
|
||||
// Copy primitives only define one
|
||||
#define IMPL_COLL_C(collf, colln) \
|
||||
IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
|
||||
// Copy primitives only define one function for copy
|
||||
#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
|
||||
|
||||
#define COLL_UNROLL 2
|
||||
// Point-to-point primitives only have one function/kernel.
// Expands to exactly one IMPL_COLL_FUNC and one IMPL_COLL_KERN, both fixed to
// RING / SIMPLE / Sum / int8_t; FUNC_INDEX_P2P is passed as the kernel's last argument.
|
||||
#define IMPL_COLL_P(func) \
|
||||
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
|
||||
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -282,28 +282,57 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
|
||||
const int offset, const int N) {
|
||||
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
|
||||
T val = vFetch(srcs[0]+idx);
|
||||
#pragma unroll
|
||||
for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
|
||||
#pragma unroll 1
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
|
||||
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
|
||||
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem) {
|
||||
const int inc = nw * UNROLL * WARP_SIZE;
|
||||
int offset = w * UNROLL * WARP_SIZE + t;
|
||||
|
||||
const T* srcs[MAXSRCS];
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
|
||||
T* dsts[MAXDSTS];
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;
|
||||
|
||||
while (offset < Nelem) {
|
||||
T vals[UNROLL];
|
||||
// Load and reduce
|
||||
for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
|
||||
#pragma unroll 1
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
|
||||
for (int i=1; i<MINSRCS; i++) {
|
||||
T vals2[UNROLL];
|
||||
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
|
||||
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i=MINSRCS; i<MAXSRCS; i++) {
|
||||
if (i<nsrcs) {
|
||||
T vals2[UNROLL];
|
||||
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
|
||||
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
|
||||
}
|
||||
}
|
||||
|
||||
// Store
|
||||
#pragma unroll
|
||||
for (int i = 0; i < MINDSTS; i++) {
|
||||
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i=MINDSTS; i<MAXDSTS; i++) {
|
||||
if (i<ndsts) {
|
||||
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
|
||||
offset += inc;
|
||||
}
|
||||
}
|
||||
|
||||
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
|
||||
const int elemOffset, const int Npack) {
|
||||
__device__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
|
||||
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Npack) {
|
||||
const int inc = nw * UNROLL * WARP_SIZE;
|
||||
int offset = w * UNROLL * WARP_SIZE + t;
|
||||
|
||||
@@ -334,8 +363,10 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
#pragma unroll 1
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
for (int i=MINDSTS; i<MAXDSTS; i++) {
|
||||
if (i<ndsts) {
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
|
||||
@@ -343,85 +374,73 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
template <typename T>
|
||||
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
|
||||
#else
|
||||
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
|
||||
|
||||
#define PACKELEMS (sizeof(Pack128) / sizeof(T))
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// Multiply UNROLL by 2 if single source/single destination
|
||||
#define AUTOUNROLL (UNROLL*((MINSRCS==1 && MINDSTS==1) ? 2 : 1))
|
||||
#endif
|
||||
|
||||
// Try to limit consecutive load/stores to 8.
|
||||
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
|
||||
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
|
||||
|
||||
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
|
||||
__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T** srcs, int ndsts, T** dsts,
|
||||
int N) {
|
||||
int Nrem = N;
|
||||
if (Nrem <= 0) return;
|
||||
|
||||
int alignDiff = 0;
|
||||
int align = ptrAlign128(srcs[0]);
|
||||
#pragma unroll
|
||||
for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
int Npreamble = alignDiff ? Nrem :
|
||||
N < alignof(int32_t) ? N :
|
||||
(alignof(int32_t) - align) % alignof(int32_t);
|
||||
#else
|
||||
int Npreamble = alignDiff ? Nrem :
|
||||
N < alignof(Pack128) ? N :
|
||||
(alignof(Pack128) - align) % alignof(Pack128);
|
||||
#endif
|
||||
|
||||
// stage 1: preamble: handle any elements up to the point of everything coming
|
||||
// into alignment
|
||||
if (Npreamble) {
|
||||
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
|
||||
Nrem -= Npreamble;
|
||||
if (Nrem == 0) return;
|
||||
}
|
||||
int offset = Npreamble;
|
||||
|
||||
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
|
||||
// assuming the pointers we have are all 128-bit alignable.
|
||||
int w = tid / WARP_SIZE; // Warp number
|
||||
int nw = nthreads / WARP_SIZE; // Number of warps
|
||||
int t = tid % WARP_SIZE; // Thread (inside the warp)
|
||||
|
||||
const int packFactor = sizeof(Pack128) / sizeof(T);
|
||||
// Check that all is 16B aligned. If not don't use 16B load/stores.
|
||||
int align = 0;
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);
|
||||
|
||||
// stage 2a: main loop
|
||||
int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
|
||||
* (AUTOUNROLL * WARP_SIZE); // round down
|
||||
int Nelem2a = Npack2a * packFactor;
|
||||
int offset = 0;
|
||||
if (align == 0) {
|
||||
// fast path: use 128b loads/stores to do the bulk of the work,
|
||||
// assuming the pointers we have are all 128-bit aligned.
|
||||
|
||||
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
|
||||
// main loop
|
||||
int Npack = (Nrem / (PACKELEMS*AUTOUNROLL*WARP_SIZE)) * (AUTOUNROLL*WARP_SIZE); // round down
|
||||
int Nelem = Npack * PACKELEMS;
|
||||
|
||||
Nrem -= Nelem2a;
|
||||
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
|
||||
|
||||
Nrem -= Nelem;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem;
|
||||
|
||||
// slightly less optimized for section when we don't have full unrolling
|
||||
Npack = Nrem / PACKELEMS;
|
||||
Nelem = Npack * PACKELEMS;
|
||||
|
||||
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
|
||||
|
||||
Nrem -= Nelem;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem;
|
||||
}
|
||||
|
||||
// unrolled, by-type (mostly for unaligned buffers)
|
||||
int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down
|
||||
|
||||
ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);
|
||||
|
||||
Nrem -= Nelem;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem2a;
|
||||
offset += Nelem;
|
||||
|
||||
// stage 2b: slightly less optimized for section when we don't have full
|
||||
// unrolling
|
||||
|
||||
int Npack2b = Nrem / packFactor;
|
||||
int Nelem2b = Npack2b * packFactor;
|
||||
|
||||
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
|
||||
|
||||
Nrem -= Nelem2b;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem2b;
|
||||
|
||||
// stage 2c: tail
|
||||
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
|
||||
// no unroll, by type. Should finish what's remaining.
|
||||
ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
|
||||
}
|
||||
|
||||
#endif // COMMON_KERNEL_H_
|
||||
|
||||
@@ -9,62 +9,62 @@
|
||||
#include "collectives.h"
|
||||
#include "common.h"
|
||||
|
||||
__device__ volatile uint64_t* ncclShmem;
|
||||
__device__ struct ncclShmemData* ncclShmem;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL128, op, dtype), \
|
||||
NCCL_COLL_NAME(coll, op, dtype)
|
||||
// Emits three comma-separated NCCL_FUNC_NAME entries for one func/algo/redop/type
// combination, one per protocol, in the order LL, LL128, SIMPLE.
#define NCCL_FUNC5(func, algo, redop, type) \
|
||||
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, LL128, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##CollNet, op, dtype)
|
||||
// Emits NCCL_FUNC5 for one func/redop/type combination once per algorithm,
// in the order TREE, RING, COLLNET.
#define NCCL_FUNC4(func, redop, type) \
|
||||
NCCL_FUNC5(func, TREE, redop, type), \
|
||||
NCCL_FUNC5(func, RING, redop, type), \
|
||||
NCCL_FUNC5(func, COLLNET, redop, type)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
// Emits NCCL_FUNC4 for one func/redop once per element type: nine entries, from
// int8_t through double. The order of the list matters: the surrounding comments
// in this file require it to stay consistent with ncclDataType_t.
#define NCCL_FUNCS3A(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, uint8_t), \
|
||||
NCCL_FUNC4(func, redop, int32_t), \
|
||||
NCCL_FUNC4(func, redop, uint32_t), \
|
||||
NCCL_FUNC4(func, redop, int64_t), \
|
||||
NCCL_FUNC4(func, redop, uint64_t), \
|
||||
NCCL_FUNC4(func, redop, half), \
|
||||
NCCL_FUNC4(func, redop, float), \
|
||||
NCCL_FUNC4(func, redop, double)
|
||||
// Emits the same number of NCCL_FUNC4 entries (nine) as NCCL_FUNCS3A, but every
// entry uses int8_t.
// NOTE(review): presumably this keeps one slot per data type so both variants
// produce equally sized runs of entries -- confirm against the table indexing code.
#define NCCL_FUNCS3B(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t)
|
||||
|
||||
// Must be consistent with ncclRedOp_t
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum ), \
|
||||
NCCL_FUNCS3A(coll, prod), \
|
||||
NCCL_FUNCS3A(coll, max ), \
|
||||
NCCL_FUNCS3A(coll, min )
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
// Emits NCCL_FUNCS3A for one func once per reduction operator, in the order
// Sum, Prod, Max, Min (Max comes before Min here). The surrounding comments in
// this file require this order to stay consistent with ncclRedOp_t.
#define NCCL_FUNCS2A(func) \
|
||||
NCCL_FUNCS3A(func, Sum ), \
|
||||
NCCL_FUNCS3A(func, Prod), \
|
||||
NCCL_FUNCS3A(func, Max ), \
|
||||
NCCL_FUNCS3A(func, Min )
|
||||
// Emits NCCL_FUNCS3B four times for one func, always with Sum, i.e. the same
// number of operator slots as NCCL_FUNCS2A but all naming the same Sum variant.
// NOTE(review): presumably used for collectives that do not reduce -- confirm.
#define NCCL_FUNCS2B(func) \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum)
|
||||
|
||||
// Must be consistent with ncclFunc_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\
|
||||
NCCL_FUNCS2B(Broadcast), \
|
||||
NCCL_FUNCS2A(Reduce), \
|
||||
NCCL_FUNCS2B(AllGather), \
|
||||
NCCL_FUNCS2A(ReduceScatter), \
|
||||
NCCL_FUNCS2A(AllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
@@ -72,12 +72,12 @@ __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCC
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8),
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_FUNCS2B(Broadcast),
|
||||
NCCL_FUNCS2A(Reduce),
|
||||
NCCL_FUNCS2B(AllGather),
|
||||
NCCL_FUNCS2A(ReduceScatter),
|
||||
NCCL_FUNCS2A(AllReduce)
|
||||
#endif
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -9,29 +9,9 @@
|
||||
#define OP128_H_
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
|
||||
v0=LOAD(ptr);
|
||||
v1=LOAD(ptr+1);
|
||||
}
|
||||
|
||||
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
|
||||
STORE(ptr, v0);
|
||||
STORE(ptr+1, v1);
|
||||
}
|
||||
|
||||
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
|
||||
return (uint64_t*)shmemGenericPtr;
|
||||
}
|
||||
|
||||
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
|
||||
v0=LOAD(shmemAsmPtr);
|
||||
v1=LOAD(shmemAsmPtr+1);
|
||||
}
|
||||
|
||||
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
|
||||
STORE(shmemAsmPtr, v0);
|
||||
STORE(shmemAsmPtr+1, v1);
|
||||
}
|
||||
#else
|
||||
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
|
||||
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
|
||||
|
||||
@@ -32,87 +32,79 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define barrier_by_id(id) do { \
|
||||
#define barrier_by_group() do { \
|
||||
const int w = threadIdx.x/WARP_SIZE; \
|
||||
barrier_next[id*MAXWARPS+w] += nthreads/WARP_SIZE; \
|
||||
__atomic_fetch_add(barriers+id, 1, __ATOMIC_SEQ_CST); \
|
||||
while (LOAD(barriers+id) < barrier_next[id*MAXWARPS+w]) /* spin */; \
|
||||
const int wid = threadIdx.x%WARP_SIZE; \
|
||||
if (wid == 0) { \
|
||||
barrier_next[w] += nthreads/WARP_SIZE; \
|
||||
__atomic_fetch_add(barriers, 1, __ATOMIC_SEQ_CST); \
|
||||
while (LOAD(barriers) < barrier_next[w]) /* spin */; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Bit masks tested against the `role` member (e.g. `role & ROLE_WAIT_RECV`).
// Each value is a distinct single bit, so several roles can be combined in one int.
//   ROLE_SRC / ROLE_DST             - gate setting srcs[0] / dsts[0] from the caller's pointers in GenericOp
//   ROLE_WAIT_RECV / ROLE_WAIT_SEND - gate the waitRecv() / waitSend() calls
//   ROLE_POST_SEND / ROLE_POST_RECV - gate the postSend() / postRecv() calls
#define ROLE_SRC 0x01
|
||||
#define ROLE_DST 0x02
|
||||
#define ROLE_WAIT_RECV 0x04
|
||||
#define ROLE_WAIT_SEND 0x08
|
||||
#define ROLE_POST_SEND 0x10
|
||||
#define ROLE_POST_RECV 0x20
|
||||
|
||||
// Implementation of primitive types
|
||||
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
|
||||
class ncclPrimitives {
|
||||
private:
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
const int wid;
|
||||
int nthreads;
|
||||
int nworkers;
|
||||
const int stepSize;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
struct ncclConnInfo* recvConn = NULL;
|
||||
volatile uint64_t* recvConnHeadPtr = NULL;
|
||||
uint64_t recvConnHead;
|
||||
volatile uint64_t* recvConnTailPtr = NULL;
|
||||
uint64_t recvConnTail;
|
||||
uint64_t recvConnTailCache; // Cache last seen value
|
||||
struct ncclConnInfo* conn = NULL;
|
||||
volatile int* connSizesFifoPtr = NULL;
|
||||
void** connPtrsFifoPtr = NULL;
|
||||
volatile uint64_t* connHeadPtr = NULL;
|
||||
volatile uint64_t* connTailPtr = NULL;
|
||||
uint64_t connTailCache; // Cache last seen value
|
||||
uint64_t connHeadCache; // Cache last seen value
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile uint64_t* sendConnTailPtr = NULL;
|
||||
uint64_t sendConnTail;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
uint64_t sendConnHead;
|
||||
uint64_t sendConnHeadCache; // Cache last seen value
|
||||
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
const T* recvDirectBuff[NRECV];
|
||||
T* sendDirectBuff[NSEND];
|
||||
#endif
|
||||
const T* recvBuff[NRECV];
|
||||
T* sendBuff[NSEND];
|
||||
int index; // Peer index I'm responsible for
|
||||
int peer = -1;
|
||||
int role = 0;
|
||||
int group;
|
||||
uint64_t step;
|
||||
T* direct = NULL;
|
||||
T* buff;
|
||||
struct ncclDevComm* comm;
|
||||
|
||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
|
||||
inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
|
||||
const T** srcs;
|
||||
T** dsts;
|
||||
|
||||
uint64_t* barriers;
|
||||
uint64_t* barrier_next;
|
||||
|
||||
// Don't use barrier 0 as it's used by the final sync
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (wid == 0) {
|
||||
if (NRECV < NSEND) barrier_by_id(0);
|
||||
else barrier_by_id(1);
|
||||
}
|
||||
if (nthreads == WARP_SIZE) __syncwarp();
|
||||
else barrier_by_group();
|
||||
#else
|
||||
if (NSEND>NRECV) {
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
|
||||
} else {
|
||||
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
|
||||
}
|
||||
if (nthreads == WARP_SIZE) __syncwarp();
|
||||
else asm volatile ("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void subBarrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
barrier();
|
||||
#else
|
||||
if (NSEND>NRECV) {
|
||||
asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
|
||||
} else {
|
||||
asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
|
||||
}
|
||||
if (nworkers == nthreads) barrier();
|
||||
else asm volatile ("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
inline __device__ int checkAbort(int i, int send) {
|
||||
inline __device__ int checkAbort() {
|
||||
spins++;
|
||||
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = LOAD(comm->abortFlag);
|
||||
@@ -121,90 +113,54 @@ class ncclPrimitives {
|
||||
return abort;
|
||||
}
|
||||
|
||||
inline __device__ void waitSend(int nbytes) {
|
||||
spins = 0;
|
||||
if (sendConnHeadPtr) {
|
||||
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
if (checkAbort(wid, 1)) break;
|
||||
}
|
||||
if (sendConnFifoPtr) {
|
||||
STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, nbytes);
|
||||
}
|
||||
sendConnHead += SLICESTEPS;
|
||||
}
|
||||
// Picks the buffer address to use for the current step.
// If DIRECTPTR is non-zero and `direct` is non-NULL, returns direct + directOffset.
// Otherwise returns the slot of `buff` selected by step % NCCL_STEPS, with
// consecutive slots spaced stepSize elements of T apart.
template <int DIRECTPTR>
|
||||
inline __device__ T* directPtr(ssize_t directOffset) {
|
||||
return DIRECTPTR && direct ? direct+directOffset : buff+(step%NCCL_STEPS)*stepSize;
|
||||
}
|
||||
|
||||
inline __device__ void waitRecv() {
|
||||
template <int DST, int DIRECTSEND>
|
||||
inline __device__ void waitSend(ssize_t directOffset, int nbytes) {
|
||||
spins = 0;
|
||||
if (recvConnTailPtr) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
uint64_t t0 = __rtc64();
|
||||
#endif
|
||||
while (recvConnTailCache < recvConnTail + SLICESTEPS) {
|
||||
recvConnTailCache = LOAD(recvConnTailPtr);
|
||||
if (checkAbort(wid, 0)) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
__atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
recvConnTail += SLICESTEPS;
|
||||
while (connHeadCache + NCCL_STEPS < step + SLICESTEPS) {
|
||||
connHeadCache = LOAD(connHeadPtr);
|
||||
if (checkAbort()) break;
|
||||
}
|
||||
if (connSizesFifoPtr) {
|
||||
STORE(connSizesFifoPtr+step%NCCL_STEPS, nbytes);
|
||||
}
|
||||
|
||||
if (connPtrsFifoPtr) dsts[DST+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
|
||||
else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
|
||||
step += SLICESTEPS;
|
||||
}
|
||||
|
||||
inline __device__ void incRecv(int i) {
|
||||
recvStep[i] += SLICESTEPS;
|
||||
// Waits for the next slice to arrive, then records where to read it from.
//   SRC          - offset added to `index` when choosing the srcs[] slot to fill.
//   DIRECTRECV   - forwarded to directPtr<> to allow/disallow the direct buffer.
//   directOffset - element offset used only when directPtr<> selects `direct`.
// Side effects: resets `spins`, refreshes connTailCache, writes srcs[SRC+index],
// and advances `step` by SLICESTEPS.
template <int SRC, int DIRECTRECV>
|
||||
inline __device__ void waitRecv(ssize_t directOffset) {
|
||||
spins = 0;
|
||||
#ifdef ENABLE_PROFILING
|
||||
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
// Spin, re-reading the tail through connTailPtr, until the cached tail reaches
// step + SLICESTEPS. A non-zero checkAbort() only leaves the loop; the code
// below still runs in that case.
while (connTailCache < step + SLICESTEPS) {
|
||||
connTailCache = LOAD(connTailPtr);
|
||||
if (checkAbort()) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Only tid 0 accumulates the elapsed counter difference, into this block's slot.
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
// Source pointer comes from the pointer FIFO when one is set, otherwise from directPtr<>.
if (connPtrsFifoPtr) srcs[SRC+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
|
||||
else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
|
||||
step += SLICESTEPS;
|
||||
}
|
||||
|
||||
inline __device__ void postRecv() {
|
||||
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += SLICESTEPS);
|
||||
STORE(connHeadPtr, step += SLICESTEPS);
|
||||
}
|
||||
|
||||
inline __device__ void incSend(int i) {
|
||||
sendStep[i] += SLICESTEPS;
|
||||
}
|
||||
inline __device__ void postSend() {
|
||||
if (sendConnTailPtr) {
|
||||
if (sendConn->next_hdp_reg) STORE(sendConn->next_hdp_reg, 0x1);
|
||||
STORE(sendConnTailPtr, sendConnTail += SLICESTEPS);
|
||||
}
|
||||
if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
|
||||
STORE(connTailPtr, step += SLICESTEPS);
|
||||
}
|
||||
|
||||
template <int DIRECTRECV>
|
||||
inline __device__ const T* directRecvPtr(int i, ssize_t directOffset) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
|
||||
#else
|
||||
return recvPtr(i);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTSEND>
|
||||
inline __device__ T* directSendPtr(int i, ssize_t directOffset) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
|
||||
#else
|
||||
return sendPtr(i);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTRECV>
|
||||
inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
|
||||
#else
|
||||
return sliceInc;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTSEND>
|
||||
inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
|
||||
#else
|
||||
return sliceInc;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
|
||||
inline __device__ void
|
||||
GenericOp(const T* srcPtr, T* dstPtr, int nelem, ssize_t directOffset) {
|
||||
@@ -212,148 +168,126 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
int sliceSize = stepSize*SLICESTEPS;
|
||||
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
|
||||
|
||||
const T* srcs[RECV*NRECV+SRC];
|
||||
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
|
||||
if (RECV) {
|
||||
if (SRC) srcs[1] = recvPtr(0);
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
|
||||
}
|
||||
|
||||
T* dsts[SEND*NSEND+DST];
|
||||
dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
|
||||
if (SEND) {
|
||||
if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
|
||||
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
|
||||
int realSize = max(0, min(dataSize, nelem-offset));
|
||||
#ifdef ENABLE_PROFILING
|
||||
uint64_t t0 = __rtc64();
|
||||
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
if (SEND) waitSend(realSize*sizeof(T));
|
||||
if (RECV) waitRecv();
|
||||
if (realSize > 0) {
|
||||
barrier();
|
||||
if (tid < nworkers) {
|
||||
if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
|
||||
if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<SRC, DIRECTRECV>(directOffset+offset);
|
||||
if (DST && (role & ROLE_DST)) dsts[0] = dstPtr+offset;
|
||||
if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
|
||||
if (realSize > 0) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
|
||||
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
if (DIRECTRECV && recvDirectBuff[0]) {
|
||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||
if (SEND) {
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
|
||||
}
|
||||
} else {
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
subBarrier();
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
}
|
||||
#else
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
#endif
|
||||
}
|
||||
barrier();
|
||||
FOR_SEND(incSend);
|
||||
FOR_RECV(incRecv);
|
||||
if (tid >= nthreads-WARP_SIZE) {
|
||||
if (SEND) {
|
||||
if (realSize > 0 && wid == 0) __threadfence_system();
|
||||
__syncwarp();
|
||||
postSend();
|
||||
}
|
||||
if (RECV) postRecv();
|
||||
}
|
||||
srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
|
||||
for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
|
||||
dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
|
||||
for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
|
||||
if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
|
||||
__syncwarp();
|
||||
if (SEND && (role & ROLE_POST_SEND)) postSend();
|
||||
if (RECV && (role & ROLE_POST_RECV)) postRecv();
|
||||
offset += realSize;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
recvBuff[i] = (const T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
|
||||
recvStep[i] = LOAD(&conn->step);
|
||||
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
recvDirectBuff[i] = NULL;
|
||||
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
|
||||
recvDirectBuff[i] = directBuff;
|
||||
if (tid == 0) STORE(conn->ptrExchange, directBuff);
|
||||
}
|
||||
#endif
|
||||
if (wid == i) recvConn = conn;
|
||||
if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
|
||||
nrecv++;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvSync() {
|
||||
if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
|
||||
recvConnTailPtr = LOAD(&recvConn->tail);
|
||||
recvConnTailCache = LOAD(recvConnTailPtr);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
|
||||
recvConnHeadPtr = LOAD(&recvConn->head);
|
||||
// Return credits in case we rounded up.
|
||||
STORE(recvConnHeadPtr, recvConnHead);
|
||||
// Loads receive-side connection state for threads whose role includes
// ROLE_WAIT_RECV or ROLE_POST_RECV; other threads do nothing.
//   channel    - channel whose devPeers[peer].recv.conn is used.
//   directBuff - currently unused: the direct-buffer setup below is commented out,
//                so `direct` is not assigned here.
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
|
||||
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
|
||||
conn = &channel->devPeers[peer].recv.conn;
|
||||
step = conn->step;
|
||||
// Re-align the saved step with ROUNDUP to SLICESPERCHUNK*SLICESTEPS.
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
|
||||
if (role & ROLE_POST_RECV) {
|
||||
connHeadPtr = conn->head;
|
||||
// Return credits in case we rounded up.
|
||||
STORE(connHeadPtr, step);
|
||||
}
|
||||
if (role & ROLE_WAIT_RECV) {
|
||||
buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
// Direct receive setup is disabled (kept for reference):
//if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
|
||||
// direct = directBuff;
|
||||
// *conn->ptrExchange = directBuff;
|
||||
//}
|
||||
connTailPtr = conn->tail;
|
||||
// Seed the cached tail so waitRecv() starts from the current value.
connTailCache = LOAD(connTailPtr);
|
||||
connPtrsFifoPtr = conn->ptrsFifo;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load the send-side state of connection `conn` into slot `i`.
//
// Caches the SIMPLE-protocol buffer and the step counter (rounded up to a
// multiple of SLICESPERCHUNK*SLICESTEPS) in sendBuff[i] / sendStep[i], then
// increments nsend. The thread whose `wid` equals `i` additionally records the
// connection and initialises its head/tail counters from the rounded step.
//
// With RCCL_USE_DIRECT_BUFFER defined, and when the connection advertises
// NCCL_DIRECT_GPU, this spins until a non-NULL pointer is published through
// conn->ptrExchange, then thread 0 resets the exchange slot to NULL.
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
  sendBuff[i] = (T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
  sendStep[i] = LOAD(&conn->step);
  sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
#if defined(RCCL_USE_DIRECT_BUFFER)
  sendDirectBuff[i] = NULL;
  // Load the flags word first and mask the loaded value. The previous form,
  // LOAD((&conn->direct) & NCCL_DIRECT_GPU), masked the address instead, which
  // is ill-formed (bitwise AND with a pointer operand).
  if (DIRECT && (LOAD(&conn->direct) & NCCL_DIRECT_GPU)) {
    void* volatile* ptr = LOAD(&conn->ptrExchange);
    // Spin until a non-NULL direct buffer pointer shows up in the exchange slot.
    while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
    // Synchronise before thread 0 clears the slot.
    barrier();
    if (tid == 0) STORE(ptr, NULL);
  }
#endif
  if (wid == i) {
    sendConn = conn;
    sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
  }
  nsend++;
}
|
||||
__device__ __forceinline__ void loadSendSync() {
|
||||
if (tid < nsend) {
|
||||
sendConnHeadPtr = LOAD(&sendConn->head);
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
sendConnFifoPtr = LOAD(&sendConn->fifo);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nsend) {
|
||||
sendConnTailPtr = LOAD(&sendConn->tail);
|
||||
// Send-side connection setup for the calling thread.
//
// Only threads whose `role` carries ROLE_WAIT_SEND or ROLE_POST_SEND do any
// work: they cache the send connection of `peer` on `channel`, read its step
// counter and round it up to a multiple of SLICESPERCHUNK*SLICESTEPS.
//   - ROLE_POST_SEND: caches the tail pointer.
//   - ROLE_WAIT_SEND: caches the SIMPLE-protocol buffer, the head pointer, a
//     snapshot of the head value, and the sizes/pointer FIFOs.
//
// The direct-buffer handshake through conn->ptrExchange is disabled in this
// version.
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
  const bool postsSend = (role & ROLE_POST_SEND) != 0;
  const bool waitsSend = (role & ROLE_WAIT_SEND) != 0;
  if (!postsSend && !waitsSend) return;  // this thread has no send role

  conn = &channel->devPeers[peer].send.conn;
  step = conn->step;
  step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);

  if (postsSend) {
    connTailPtr = conn->tail;
  }

  if (waitsSend) {
    buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
    connHeadPtr = conn->head;
    connHeadCache = LOAD(connHeadPtr);
    connSizesFifoPtr = conn->sizesFifo;
    connPtrsFifoPtr = conn->ptrsFifo;
  }
}
|
||||
|
||||
// Write the receive head counter back into the connection's step field.
// Only threads with tid >= nthreads-WARP_SIZE and wid < nrecv perform the
// store, which is followed by a system-wide memory fence.
__device__ __forceinline__ void saveRecvSync() {
  if (tid < nthreads-WARP_SIZE || wid >= nrecv) return;  // not a saving thread

  STORE(&recvConn->step, recvConnHead);
  __threadfence_system();
}
|
||||
|
||||
__device__ __forceinline__ void saveSendSync() {
|
||||
if (tid < nsend) {
|
||||
STORE(&sendConn->step, sendConnHead);
|
||||
// Write this thread's step counter back into its connection.
// Only threads holding ROLE_POST_SEND or ROLE_POST_RECV perform the store,
// which is followed by a system-wide memory fence.
__device__ __forceinline__ void saveSync() {
  const bool isPoster = (role & (ROLE_POST_SEND|ROLE_POST_RECV)) != 0;
  if (!isPoster) return;

  conn->step = step;
  __threadfence_system();
}
|
||||
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize) {
|
||||
barriers = channel->barrier;
|
||||
barrier_next = channel->barrier_next;
|
||||
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
|
||||
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next) {
|
||||
nthreads = nworkers;
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
|
||||
// nthreads += postThreads;
|
||||
|
||||
// Make sure step is updated before we read it.
|
||||
barrier();
|
||||
|
||||
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
|
||||
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
|
||||
loadRecvSync();
|
||||
loadSendSync();
|
||||
for (int i=0; i<NRECV; i++) if (recvPeers[i] != -1) nrecv++;
|
||||
for (int i=0; i<NSEND; i++) if (sendPeers[i] != -1) nsend++;
|
||||
|
||||
#define SYNC_GROUP 8
|
||||
static_assert(NSEND < SYNC_GROUP && NRECV < SYNC_GROUP, "Not enough threads to cover all peers");
|
||||
|
||||
int g = tid / SYNC_GROUP;
|
||||
int ng = nthreads / SYNC_GROUP;
|
||||
index = tid % SYNC_GROUP;
|
||||
|
||||
if (g == 0) {
|
||||
if (index < nrecv) role |= ROLE_WAIT_RECV;
|
||||
if (index == nrecv) role |= ROLE_SRC;
|
||||
} else if (g == 1) {
|
||||
if (index < nsend) role |= ROLE_WAIT_SEND;
|
||||
if (index == nsend) role |= ROLE_DST;
|
||||
} else if (g == ng - 2) {
|
||||
if (index < nrecv) role |= ROLE_POST_RECV;
|
||||
} else if (g == ng - 1) {
|
||||
if (index < nsend) role |= ROLE_POST_SEND;
|
||||
}
|
||||
|
||||
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) peer = recvPeers[index];
|
||||
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) peer = sendPeers[index];
|
||||
|
||||
loadRecvConn(channel, directBuff);
|
||||
loadSendConn(channel);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void
|
||||
@@ -414,8 +348,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
|
||||
__device__ __forceinline__ ~ncclPrimitives() {
|
||||
// Save steps for the next operation
|
||||
saveRecvSync();
|
||||
saveSendSync();
|
||||
saveSync();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -424,10 +357,10 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define INIT_COUNTER \
|
||||
if (tid == 0) { t0 = __rtc64(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
|
||||
if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
|
||||
|
||||
#define ACCUMULATE_COUNTER(prim) \
|
||||
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __rtc64() - t0 \
|
||||
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __builtin_amdgcn_s_memrealtime() - t0 \
|
||||
+ ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \
|
||||
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
|
||||
#else
|
||||
|
||||
@@ -205,7 +205,7 @@ class ncclLLPrimitives {
|
||||
sendConnHeadPtr = LOAD(&sendConn->head);
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
sendConnHead = LOAD(&sendConn->step);
|
||||
sendConnFifoPtr = LOAD(&sendConn->fifo);
|
||||
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -118,9 +118,14 @@ class ncclLL128Primitives {
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
if (u*WARP_SIZE < maxOffset) {
|
||||
uint64_t v0, v1;
|
||||
load128(src64Ptr+u*WARP_SIZE, v0, v1);
|
||||
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
//load128(src64Ptr+u*WARP_SIZE, v0, v1);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(src64Ptr+u*WARP_SIZE));
|
||||
//storeShmem128(shmemAsmPtr+u*WARP_SIZE, i2[0], i2[1]);
|
||||
*(shmemAsmPtr+u*WARP_SIZE) = i2[0];
|
||||
*(shmemAsmPtr+u*WARP_SIZE+1) = i2[1];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -135,15 +140,24 @@ class ncclLL128Primitives {
|
||||
|
||||
template <int ELEMS_PER_THREAD>
|
||||
inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
|
||||
uint64_t v[ELEMS_PER_THREAD];
|
||||
using Velem = uint64_t __attribute__((ext_vector_type(ELEMS_PER_THREAD)));
|
||||
Velem v;
|
||||
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
v[u] = *(shmemAsmPtr+u*WARP_SIZE);
|
||||
v[u+1] = *(shmemAsmPtr+u*WARP_SIZE+1);
|
||||
//loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
//if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
i2[0] = v[u];
|
||||
i2[1] = v[u+1];//
|
||||
if (u*WARP_SIZE < maxOffset) asm volatile ("flat_store_dwordx4 %0, %1\n"
|
||||
"s_waitcnt vmcnt(0)\n" : : "v"(dst64Ptr+u*WARP_SIZE), "v"(i2));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,45 +190,52 @@ class ncclLL128Primitives {
|
||||
uint64_t flag = recvFlag(0);
|
||||
uint64_t* ptr = recvPtr(0)+ll128Offset;
|
||||
bool needReload;
|
||||
uint64_t v0, v1;
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
do {
|
||||
if (wid == 0) STORE(sync, 0);
|
||||
needReload = false;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (v1 != flag);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (i2[1] != flag);
|
||||
}
|
||||
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
|
||||
if (LOAD(sync) == 0) break;
|
||||
} while (checkAbort(0, 0) == 0);
|
||||
} while (LOAD(sync) && checkAbort(0, 0) == 0);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
|
||||
v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = SRC ? MULTI<FUNC, T>()(i2[0], v[u]) : i2[0];
|
||||
v[u+1] = SRC ? MULTI<FUNC, T>()(i2[1], v[u+1]) : i2[1];
|
||||
}
|
||||
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) {
|
||||
uint64_t flag = recvFlag(i);
|
||||
uint64_t* ptr = recvPtr(i)+ll128Offset;
|
||||
uint64_t v0, v1;
|
||||
Vec i2;
|
||||
do {
|
||||
if (wid == 0) STORE(sync, 0);
|
||||
needReload = false;
|
||||
needReload = 0;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (v1 != flag);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (i2[1] != flag);
|
||||
}
|
||||
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
|
||||
if (LOAD(sync) == 0) break;
|
||||
} while (checkAbort(i, 0) == 0);
|
||||
} while (LOAD(sync) && checkAbort(i, 0) == 0);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = MULTI<FUNC, T>()(v0, v[u]);
|
||||
v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = MULTI<FUNC, T>()(i2[0], v[u]);
|
||||
v[u+1] = MULTI<FUNC, T>()(i2[1], v[u+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -223,18 +244,30 @@ class ncclLL128Primitives {
|
||||
/************************ Send **************************/
|
||||
if (SEND) {
|
||||
for (int i=1; i<NSEND && i<nsend; i++) {
|
||||
int flag = sendFlag(i);
|
||||
uint64_t flag = sendFlag(i);
|
||||
uint64_t* ptr = sendPtr(i)+ll128Offset;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
i2[0] = v[u];
|
||||
i2[1] = flagThread ? flag : v[u+1];//
|
||||
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
|
||||
}
|
||||
}
|
||||
int flag = sendFlag(0);
|
||||
uint64_t flag = sendFlag(0);
|
||||
uint64_t* ptr = sendPtr(0)+ll128Offset;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
i2[0] = v[u];
|
||||
i2[1] = flagThread ? flag : v[u+1];//
|
||||
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
|
||||
}
|
||||
}
|
||||
/********************** End Send ************************/
|
||||
@@ -279,7 +312,7 @@ class ncclLL128Primitives {
|
||||
const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
|
||||
if (SRC) {
|
||||
int done = 0;
|
||||
if ((((uint64_t)srcPtr)&0xf) == 0) {
|
||||
if ((((uint64_t)srcPtr)&0x3) == 0) {
|
||||
loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
|
||||
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
@@ -290,7 +323,7 @@ class ncclLL128Primitives {
|
||||
__syncwarp();
|
||||
if (DST) {
|
||||
int done = 0;
|
||||
if ((((uint64_t)dstPtr)&0xf) == 0) {
|
||||
if ((((uint64_t)dstPtr)&0x3) == 0) {
|
||||
storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
|
||||
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
@@ -330,10 +363,10 @@ class ncclLL128Primitives {
|
||||
sendConnHeadPtr = LOAD(&sendConn->head);
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
sendConnHead = LOAD(&sendConn->step);
|
||||
sendConnFifoPtr = LOAD(&sendConn->fifo);
|
||||
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid<nsend) {
|
||||
if (sendConn->fifo) {
|
||||
if (sendConn->sizesFifo) {
|
||||
sendConnTailPtr = LOAD(&sendConn->tail);
|
||||
sendConnTail = LOAD(&sendConn->step);
|
||||
}
|
||||
@@ -357,12 +390,7 @@ class ncclLL128Primitives {
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
|
||||
// for __any_sync
|
||||
if (NSEND > NRECV)
|
||||
sync = channel->sync + 2 + tid/WARP_SIZE;
|
||||
else
|
||||
sync = channel->sync + tid/WARP_SIZE;
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem->data+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid), sync(ncclShmem->sync+warp) {
|
||||
// Make sure step is updated before we read it.
|
||||
barrier();
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_R(ncclReduce, ncclCollReduce);
|
||||
IMPL_COLL_R(Reduce);
|
||||
|
||||
@@ -9,151 +9,145 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Reduce, tree variant: the body is empty, so calling this kernel does
// nothing. None of the template parameters or `args` are used.
// NOTE(review): presumably kept only so every algorithm has a kernel symbol
// for the dispatch macros — confirm against IMPL_COLL_R.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Reduce, CollNet variant: the body is empty, so calling this kernel does
// nothing. None of the template parameters or `args` are used.
// NOTE(review): presumably kept only so every algorithm has a kernel symbol
// for the dispatch macros — confirm against IMPL_COLL_R.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce, tree variant for the LL protocol: the body is empty, so calling
// this kernel does nothing. None of the template parameters or `args` are used.
// NOTE(review): presumably kept only so every algorithm/protocol pair has a
// kernel symbol for the dispatch macros — confirm against IMPL_COLL_R.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// Reduce, CollNet variant for the LL protocol: the body is empty, so calling
// this kernel does nothing. None of the template parameters or `args` are used.
// NOTE(review): presumably kept only so every algorithm/protocol pair has a
// kernel symbol for the dispatch macros — confirm against IMPL_COLL_R.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Reduce, tree variant for the LL128 protocol: the body is empty, so calling
// this kernel does nothing. None of the template parameters or `args` are used.
// NOTE(review): presumably kept only so every algorithm/protocol pair has a
// kernel symbol for the dispatch macros — confirm against IMPL_COLL_R.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
// Partial specialization of ncclFunction for ncclFuncReduce with
// NCCL_ALGO_TREE, for any protocol, reduction op, element type and unroll
// factor. run() has an empty body: selecting this combination executes nothing.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// No-op entry point; `args` is ignored.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
// Reduce, CollNet variant for the LL128 protocol: the body is empty, so
// calling this kernel does nothing. None of the template parameters or `args`
// are used.
// NOTE(review): presumably kept only so every algorithm/protocol pair has a
// kernel symbol for the dispatch macros — confirm against IMPL_COLL_R.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
// Partial specialization of ncclFunction for ncclFuncReduce with
// NCCL_ALGO_COLLNET, for any protocol, reduction op, element type and unroll
// factor. run() has an empty body: selecting this combination executes nothing.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// No-op entry point; `args` is ignored.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
|
||||
IMPL_COLL_R(ReduceScatter);
|
||||
|
||||
@@ -9,195 +9,189 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.send(thisInput+offset, nelem);
|
||||
prims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
// Tree-algorithm ReduceScatter kernel entry point (per its name).
// The body is empty: calling it performs no work and `args` is never read.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
// CollNet-algorithm ReduceScatter kernel entry point (per its name).
// The body is empty: calling it performs no work and `args` is never read.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// Tree-algorithm, LL-protocol ReduceScatter kernel entry point (per its name).
// The body is empty: calling it performs no work and `args` is never read.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// CollNet-algorithm, LL-protocol ReduceScatter kernel entry point (per its name).
// The body is empty: calling it performs no work and `args` is never read.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// Partial specialization of ncclFunction for ReduceScatter with NCCL_ALGO_TREE,
// left generic over protocol (PROTO), reduction op, element type and unroll factor.
// run() has an empty body, so this combination does no work when invoked.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// No-op: `args` is ignored.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// Tree-algorithm, LL128-protocol ReduceScatter kernel entry point (per its name).
// The body is empty: calling it performs no work and `args` is never read.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// CollNet-algorithm, LL128-protocol ReduceScatter kernel entry point (per its name).
// The body is empty: calling it performs no work and `args` is never read.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
// Partial specialization of ncclFunction for ReduceScatter with NCCL_ALGO_COLLNET,
// left generic over protocol (PROTO), reduction op, element type and unroll factor.
// run() has an empty body, so this combination does no work when invoked.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// No-op: `args` is ignored.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,5 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
|
||||
IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
|
||||
IMPL_COLL_P(SendRecv);
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,74 +8,85 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->p2p.nThreads;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* firstArgs) {
|
||||
struct ncclWorkElem* args = firstArgs;
|
||||
int tid = threadIdx.x;
|
||||
int group = 0;
|
||||
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
|
||||
int nThreadsSegment = args->p2p.nThreads;
|
||||
if (nThreadsSegment == 0) return; // Nothing else to do
|
||||
int groupRecv = group;
|
||||
group += 1;
|
||||
int groupSend = group;
|
||||
group += 1;
|
||||
if (tid < nThreadsSegment) {
|
||||
const int nThreads = nThreadsSegment;
|
||||
|
||||
// Compute pointers
|
||||
const T* sendbuff = (const T*)args->sendbuff;
|
||||
T* recvbuff = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T* sendbuff = (const T*)args->sendbuff;
|
||||
T* recvbuff = (T*)args->recvbuff;
|
||||
const ssize_t sendCount = args->p2p.sendCount;
|
||||
const ssize_t recvCount = args->p2p.recvCount;
|
||||
|
||||
if (args->p2p.delta < 0 ) return; // No-op
|
||||
const int delta = args->p2p.delta;
|
||||
if (delta == 0) {
|
||||
if (tid < nThreads && sendbuff != recvbuff) {
|
||||
// local copy : ReduceOrCopyMulti takes an int as number of elements,
|
||||
// so we split it in blocks of 1G elements.
|
||||
int blockSize = 1<<30;
|
||||
for (size_t offset=0; offset<sendCount; offset += blockSize) {
|
||||
size_t remaining = sendCount - offset;
|
||||
if (remaining < blockSize) blockSize = remaining;
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
|
||||
sendbuff += blockSize; recvbuff += blockSize;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
|
||||
if (args->p2p.delta == 0) {
|
||||
if (tid < nthreads && sendbuff != recvbuff) {
|
||||
// local copy : ReduceOrCopyMulti takes an int as number of elements,
|
||||
// so we split it in blocks of 1G elements.
|
||||
int blockSize = 1<<30;
|
||||
for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) {
|
||||
size_t remaining = args->p2p.sendCount - offset;
|
||||
if (remaining < blockSize) blockSize = remaining;
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize);
|
||||
sendbuff += blockSize; recvbuff += blockSize;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
|
||||
|
||||
int nThreadsSplit = nThreads/2;
|
||||
if ((tid < nThreadsSplit) && recvCount >= 0) {
|
||||
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
|
||||
int nt = nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
|
||||
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
|
||||
|
||||
if (recvCount == 0) {
|
||||
prims.recv(recvbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
|
||||
int realChunkSize = min(chunkSize, recvCount-offset);
|
||||
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, recvCount-offset);
|
||||
prims.directRecv(recvbuff+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
if ((tid >= nThreadsSplit) && sendCount >= 0) {
|
||||
int peer = (comm->rank+delta)%comm->nRanks;
|
||||
int nt = nThreads-nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
|
||||
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
|
||||
|
||||
if (sendCount == 0) {
|
||||
prims.send(sendbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
|
||||
int realChunkSize = min(chunkSize, sendCount-offset);
|
||||
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, sendCount-offset);
|
||||
prims.directSend(sendbuff+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tid -= nThreadsSegment;
|
||||
if (tid < 0) return;
|
||||
args++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR;
|
||||
|
||||
int nthreadsSplit = nthreads/2;
|
||||
// We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and
|
||||
// receive threads, but then we define all peers to -1 since sender threads don't receive and
|
||||
// receive threads don't send.
|
||||
int peerNone[2] = {-1,-1};
|
||||
|
||||
if (tid < nthreadsSplit ) {
|
||||
const ssize_t sendSize = args->p2p.sendCount;
|
||||
if (sendSize < 0) return;
|
||||
|
||||
int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC>
|
||||
prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
|
||||
|
||||
if (sendSize == 0) {
|
||||
prims.send(sendbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) {
|
||||
int realChunkSize = min(stepSize, sendSize-offset);
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, sendSize-offset);
|
||||
prims.directSend(sendbuff+offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
const ssize_t recvSize = args->p2p.recvCount;
|
||||
if (recvSize < 0) return;
|
||||
|
||||
int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
|
||||
|
||||
if (recvSize == 0) {
|
||||
prims.recv(recvbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) {
|
||||
int realChunkSize = min(stepSize, recvSize-offset);
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, recvSize-offset);
|
||||
prims.directRecv(recvbuff+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -29,9 +29,10 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
return ncclSuccess;
|
||||
}
|
||||
else {
|
||||
struct ncclInfo info = { ncclCollGather, "Gather",
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
//struct ncclInfo info = { ncclCollGather, "Gather",
|
||||
// sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
// GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
|
||||
//return ncclEnqueueCheck(&info);
|
||||
return ncclInternalError;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollReduce, "Reduce",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
||||
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
||||
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
|
||||
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
||||
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -29,9 +29,10 @@ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
return ncclSuccess;
|
||||
}
|
||||
else {
|
||||
struct ncclInfo info = { ncclCollScatter, "Scatter",
|
||||
sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
//struct ncclInfo info = { ncclCollScatter, "Scatter",
|
||||
// sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
// SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
|
||||
//return ncclEnqueueCheck(&info);
|
||||
return ncclInternalError;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "Send",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncSendRecv, "Send",
|
||||
sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
@@ -27,7 +28,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "Recv",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncSendRecv, "Recv",
|
||||
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
|
||||
@@ -128,7 +128,7 @@ void ncclDebugInit() {
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (ncclDebugLevel == -1) ncclDebugInit();
|
||||
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
|
||||
if (ncclDebugLevel < level) return;
|
||||
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
|
||||
|
||||
// Gather the rank information. This can take > 1us so we want to make sure
|
||||
// we only do it when needed.
|
||||
@@ -145,11 +145,11 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
if (level == NCCL_LOG_WARN)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
|
||||
else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
|
||||
else if (level == NCCL_LOG_INFO)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
#ifdef ENABLE_TRACE
|
||||
else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
|
||||
else if (level == NCCL_LOG_TRACE) {
|
||||
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
|
||||
+253
-213
@@ -8,59 +8,58 @@
|
||||
#include "enqueue.h"
|
||||
#include "argcheck.h"
|
||||
#include "coll_net.h"
|
||||
#include "../graph/topo.h"
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype)
|
||||
#define NCCL_FUNC5(func, algo, redop, dtype) \
|
||||
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
|
||||
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
|
||||
NCCL_KERN_NAME(func, algo, LL, redop, dtype)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##CollNet, op, dtype)
|
||||
#define NCCL_FUNC4(func, redop, type) \
|
||||
NCCL_FUNC5(func, TREE, redop, type), \
|
||||
NCCL_FUNC5(func, RING, redop, type), \
|
||||
NCCL_FUNC5(func, COLLNET, redop, type)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64), \
|
||||
NCCL_FUNC4(coll, op, b16)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
#define NCCL_FUNCS3A(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, uint8_t), \
|
||||
NCCL_FUNC4(func, redop, int32_t), \
|
||||
NCCL_FUNC4(func, redop, uint32_t), \
|
||||
NCCL_FUNC4(func, redop, int64_t), \
|
||||
NCCL_FUNC4(func, redop, uint64_t), \
|
||||
NCCL_FUNC4(func, redop, half), \
|
||||
NCCL_FUNC4(func, redop, float), \
|
||||
NCCL_FUNC4(func, redop, double), \
|
||||
NCCL_FUNC4(func, redop, rccl_bfloat16)
|
||||
#define NCCL_FUNCS3B(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t)
|
||||
|
||||
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum)
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
#define NCCL_FUNCS2A(func) \
|
||||
NCCL_FUNCS3A(func, Sum), \
|
||||
NCCL_FUNCS3A(func, Sum), \
|
||||
NCCL_FUNCS3A(func, Sum), \
|
||||
NCCL_FUNCS3A(func, Sum)
|
||||
#define NCCL_FUNCS2B(func) \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum)
|
||||
|
||||
typedef void(*ncclKern_t)(struct ncclDevComm*);
|
||||
typedef void(*ncclKern_t)(struct ncclWorkElem first);
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
static ncclKern_t const ncclKerns[1] = {
|
||||
NCCL_KERN_NAME(ncclSendRecv, copy, i8)
|
||||
NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
};
|
||||
|
||||
/*****************************************************************************/
|
||||
@@ -70,12 +69,8 @@ static ncclKern_t const ncclKerns[1] = {
|
||||
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
|
||||
if (cgMode & 0x01) {
|
||||
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
|
||||
// These flags are to reduce the latency of using this API
|
||||
#if __HIP__
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
#else
|
||||
0));
|
||||
#endif
|
||||
// These flags are to reduce the latency of using this API
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
return ncclSuccess;
|
||||
}
|
||||
int savedDev;
|
||||
@@ -83,44 +78,62 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
|
||||
for (int i = 0; i < numDevices; i++) {
|
||||
hipLaunchParams* params = paramsList+i;
|
||||
CUDACHECK(hipSetDevice(cudaDevs[i]));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
|
||||
}
|
||||
CUDACHECK(hipSetDevice(savedDev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) {
|
||||
if (channel->workCount == NCCL_MAX_OPS) {
|
||||
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
|
||||
struct ncclWork* w = channel->workFifo+opIndex;
|
||||
struct ncclWorkElem* e = w->elems;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
|
||||
while (LOAD(activePtr) != 0) sched_yield();
|
||||
memset(w, 0, sizeof(struct ncclWork));
|
||||
// Initialize with work elem if provided
|
||||
if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
|
||||
STORE(&e->active, 1);
|
||||
e->index = opIndex;
|
||||
channel->workFifoTail++;
|
||||
channel->workCount++;
|
||||
if (work) *work = w;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
// Only launch blocks where we have work to do.
|
||||
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
|
||||
if (comm->channels[c].collCount) params->gridDim.x = c+1;
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
if (comm->channels[c].workCount) params->gridDim.x = c+1;
|
||||
}
|
||||
|
||||
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
|
||||
for (int c=0; c<params->gridDim.x; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (channel->collCount == 0) {
|
||||
int opIndex = channel->collFifoTail;
|
||||
struct ncclColl* c = channel->collectives+opIndex;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
|
||||
while (activePtr[0] != 0) sched_yield();
|
||||
|
||||
c->args.p2p.delta = -1; // no-op
|
||||
c->funcIndex = FUNC_INDEX_P2P;
|
||||
c->args.comm = comm->devComm;
|
||||
c->active = 1;
|
||||
opIndex = (opIndex+1)%NCCL_MAX_OPS;
|
||||
c->nextIndex = opIndex;
|
||||
channel->collFifoTail = opIndex;
|
||||
channel->collCount++;
|
||||
if (channel->workCount == 0) {
|
||||
struct ncclWork* w;
|
||||
NCCLCHECK(getNextOp(channel, &w, NULL));
|
||||
struct ncclWorkElem* e = w->elems;
|
||||
e->comm = comm->devComm;
|
||||
e->funcIndex = FUNC_INDEX_P2P;
|
||||
e->p2p.nThreads = 0;
|
||||
}
|
||||
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
|
||||
STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
|
||||
}
|
||||
|
||||
// Find the first operation, choose the kernel accordingly and pass it
|
||||
// as the first argument.
|
||||
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
|
||||
struct ncclChannel* c0 = comm->channels;
|
||||
struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
|
||||
struct ncclWorkElem* elem = work->elems;
|
||||
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
|
||||
// As we inline the first coll directly, we can free it immediately.
|
||||
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
|
||||
|
||||
comm->args = comm->devComm;
|
||||
params->func = (void *)ncclKerns[0];
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -131,7 +144,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
|
||||
bool done = false;
|
||||
while (done == false) {
|
||||
if (val >= comm->intraRanks) {
|
||||
WARN("Trying to launch too many collectives");
|
||||
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
if (val+1 == comm->intraRanks) {
|
||||
@@ -151,7 +164,7 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
|
||||
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
|
||||
int val = LOAD(ptr);
|
||||
if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
|
||||
WARN("Trying to launch too many collectives");
|
||||
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -212,7 +225,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
|
||||
|
||||
if (comm->launchMode == ncclComm::PARALLEL) {
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
|
||||
} else {
|
||||
NCCLCHECK(ncclCpuBarrierOut(comm));
|
||||
}
|
||||
@@ -222,13 +235,18 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
// launch and the ncclProxyStart call could cause a deadlock.
|
||||
// Also, starting the proxies after the CUDA launch seems to be better for
|
||||
// performance (latency).
|
||||
uint64_t max = 0ULL;
|
||||
for (int r=0; r<params->gridDim.x; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
channel->collStart = channel->collFifoTail;
|
||||
channel->collCount = 0;
|
||||
max = std::max(max, channel->workFifoTail);
|
||||
channel->workCount = 0;
|
||||
}
|
||||
for (int r=0; r<comm->p2pnChannels; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
channel->workFifoTail = max;
|
||||
}
|
||||
params->gridDim.x = params->blockDim.x = 0;
|
||||
comm->lastOpCount = comm->opCount;
|
||||
comm->lastOpCount = max;
|
||||
NCCLCHECK(ncclProxyStart(comm));
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -273,10 +291,6 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (info->coll == ncclCollAllToAll || info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv) {
|
||||
info->algorithm = NCCL_ALGO_RING;
|
||||
info->protocol = NCCL_PROTO_SIMPLE;
|
||||
}
|
||||
if (info->algorithm == -1 || info->protocol == -1) {
|
||||
WARN("Error : no algorithm/protocol available");
|
||||
return ncclInternalError;
|
||||
@@ -284,16 +298,12 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
|
||||
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
|
||||
|
||||
int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
|
||||
if (info->comm->topo->type == RCCL_TOPO_4P2H_ROME && (info->coll == ncclCollAllToAll ||
|
||||
info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv))
|
||||
nc = 2;
|
||||
int nc = (info->nChannels > 0) ? info->nChannels :
|
||||
(info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
|
||||
int nt = comm->maxThreads[info->algorithm][info->protocol];
|
||||
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
|
||||
while (info->nBytes < nc*nt*threadThreshold) {
|
||||
// do not reduce channels in case of alltoall
|
||||
if (info->algorithm != NCCL_ALGO_COLLNET && info->coll != ncclCollAllToAll &&
|
||||
info->coll != ncclCollGather && info->coll != ncclCollScatter && info->coll != ncclCollAllToAllv && nc >= 2) nc--;
|
||||
if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// do not reduce threads count on VEGA
|
||||
#else
|
||||
@@ -303,7 +313,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
|
||||
#endif
|
||||
info->nChannels = nc;
|
||||
info->nThreads = nt;
|
||||
@@ -312,20 +323,15 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
|
||||
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
|
||||
switch (info->coll) {
|
||||
case ncclCollBroadcast:
|
||||
case ncclFuncBroadcast:
|
||||
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
|
||||
case ncclCollReduce:
|
||||
case ncclFuncReduce:
|
||||
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
|
||||
case ncclCollReduceScatter:
|
||||
case ncclCollAllGather:
|
||||
case ncclFuncReduceScatter:
|
||||
case ncclFuncAllGather:
|
||||
info->pattern = ncclPatternRing; break;
|
||||
case ncclCollAllReduce:
|
||||
case ncclFuncAllReduce:
|
||||
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
|
||||
case ncclCollGather:
|
||||
case ncclCollScatter:
|
||||
case ncclCollAllToAll:
|
||||
case ncclCollAllToAllv:
|
||||
info->pattern = ncclPatternAll; break;
|
||||
default:
|
||||
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
|
||||
return ncclInternalError;
|
||||
@@ -342,8 +348,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
||||
case ncclPatternPipelineTo:
|
||||
case ncclPatternCollTreeUp:
|
||||
case ncclPatternCollTreeDown:
|
||||
case ncclPatternAll:
|
||||
info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
|
||||
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
|
||||
case ncclPatternRing:
|
||||
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
|
||||
case ncclPatternRingTwice:
|
||||
@@ -355,41 +360,23 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
|
||||
coll->args.sendbuff = info->sendbuff;
|
||||
coll->args.recvbuff = info->recvbuff;
|
||||
coll->args.comm = info->comm->devComm;
|
||||
coll->args.opCount = info->comm->opCount;
|
||||
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
|
||||
work->comm = info->comm->devComm;
|
||||
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
coll->args.p2p.sendCount = info->sendbytes;
|
||||
coll->args.p2p.recvCount = info->recvbytes;
|
||||
coll->args.p2p.delta = info->delta;
|
||||
coll->funcIndex = FUNC_INDEX_P2P;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
|
||||
#else
|
||||
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Set nstepsPerLoop and nchunksPerLoop
|
||||
NCCLCHECK(getAlgoInfo(info));
|
||||
NCCLCHECK(getPatternInfo(info));
|
||||
NCCLCHECK(getLoopInfo(info));
|
||||
|
||||
if (info->coll == ncclCollAllToAllv) {
|
||||
coll->args.a2av.count = info->count;
|
||||
coll->args.a2av.nChannels = info->nChannels;
|
||||
coll->args.a2av.nThreads = info->nThreads;
|
||||
} else {
|
||||
coll->args.coll.root = info->root;
|
||||
coll->args.coll.count = info->count;
|
||||
coll->args.coll.nChannels = info->nChannels;
|
||||
coll->args.coll.nThreads = info->nThreads;
|
||||
}
|
||||
work->opCount = info->comm->opCount;
|
||||
work->sendbuff = info->sendbuff;
|
||||
work->recvbuff = info->recvbuff;
|
||||
work->coll.root = info->root;
|
||||
work->coll.count = info->count;
|
||||
work->coll.nChannels = info->nChannels;
|
||||
work->nThreads = info->nThreads;
|
||||
|
||||
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
|
||||
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
|
||||
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
|
||||
@@ -400,25 +387,25 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (info->pattern == ncclPatternTreeUpDown) {
|
||||
// Optimize chunkSize / nSteps
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
}
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
// Optimize chunkSize / nSteps
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->protocol == NCCL_PROTO_LL) {
|
||||
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
|
||||
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
|
||||
coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
|
||||
coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
|
||||
work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
|
||||
work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
|
||||
int nNodes = info->comm->nNodes;
|
||||
float ppn = info->comm->nRanks / (float)nNodes;
|
||||
@@ -426,7 +413,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
||||
work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
||||
}
|
||||
|
||||
// Compute nSteps for proxies
|
||||
@@ -434,20 +421,20 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
|
||||
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
|
||||
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
|
||||
int nLoops;
|
||||
if (info->pattern != ncclPatternAll)
|
||||
nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
|
||||
else
|
||||
nLoops = (int)(DIVUP(info->nBytes, (((size_t)((info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1))))*info->comm->nRanks*info->nchunksPerLoop*chunkEffectiveSize));
|
||||
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
|
||||
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
|
||||
proxyArgs->sliceSteps = sliceSteps;
|
||||
proxyArgs->chunkSteps = chunkSteps;
|
||||
proxyArgs->protocol = info->protocol;
|
||||
proxyArgs->opCount = info->comm->opCount;
|
||||
proxyArgs->dtype = info->datatype;
|
||||
proxyArgs->redOp = info->op;
|
||||
if (info->coll != ncclCollAllToAllv) TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkEffectiveSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
||||
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
|
||||
// because some protocols need to transmit more than the total size, plus they sometimes
|
||||
// round up
|
||||
proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
|
||||
|
||||
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
||||
nLoops, proxyArgs->nsteps, info->comm);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -464,32 +451,26 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
|
||||
if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
|
||||
if (info->comm->nRanks == 1) {
|
||||
if (info->sendbuff != info->recvbuff)
|
||||
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclColl coll;
|
||||
struct ncclWorkElem work;
|
||||
struct ncclProxyArgs proxyArgs;
|
||||
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
|
||||
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
|
||||
NCCLCHECK(computeColl(info, &work, &proxyArgs));
|
||||
|
||||
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
|
||||
|
||||
int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
|
||||
int nChannels = work.coll.nChannels;
|
||||
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
|
||||
|
||||
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
|
||||
int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
|
||||
info->comm->myParams->gridDim.x % info->comm->nChannels;
|
||||
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
|
||||
struct ncclChannel* channel = info->comm->channels+channelId;
|
||||
|
||||
if (channel->collCount == NCCL_MAX_OPS) {
|
||||
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
// Proxy
|
||||
proxyArgs.channel = channel;
|
||||
// Adjust pattern for CollNet based on channel index
|
||||
@@ -497,77 +478,143 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
|
||||
info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
|
||||
}
|
||||
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
|
||||
NCCLCHECK(ncclProxySaveP2p(info, channel));
|
||||
} else if (info->coll == ncclCollAllToAll || info->coll == ncclCollScatter || info->coll == ncclCollGather || info->coll == ncclCollAllToAllv) {
|
||||
NCCLCHECK(ncclProxySaveA2a(&proxyArgs, info));
|
||||
} else {
|
||||
NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
|
||||
}
|
||||
if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
|
||||
|
||||
info->comm->myParams->gridDim.x++;
|
||||
int opIndex = channel->collFifoTail;
|
||||
struct ncclColl* c = channel->collectives+opIndex;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
|
||||
while (LOAD(activePtr) != 0) sched_yield();
|
||||
|
||||
memcpy(c, &coll, sizeof(struct ncclColl));
|
||||
if (info->coll == ncclCollAllToAllv) {
|
||||
c->args.a2av.extra = channel->collectivesExtra + info->comm->nRanks*4*opIndex;
|
||||
memcpy(c->args.a2av.extra, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(c->args.a2av.extra+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(c->args.a2av.extra+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(c->args.a2av.extra+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
|
||||
c->args.a2av.bid = bid % coll.args.coll.nChannels;
|
||||
} else if (info->coll != ncclCollSendRecv)
|
||||
c->args.coll.bid = bid % coll.args.coll.nChannels;
|
||||
|
||||
STORE(&c->active, 1);
|
||||
opIndex = (opIndex+1)%NCCL_MAX_OPS;
|
||||
c->nextIndex = opIndex;
|
||||
channel->collFifoTail = opIndex;
|
||||
channel->collCount++;
|
||||
work.coll.bid = bid % nChannels;
|
||||
NCCLCHECK(getNextOp(channel, NULL, &work));
|
||||
}
|
||||
info->comm->opCount++;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Save p2p operations in comm->p2plist. Operations will be posted to channels
|
||||
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
|
||||
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
|
||||
|
||||
ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
|
||||
if (comm->asyncOpCount == 0) {
|
||||
return ncclSuccess;
|
||||
} else if (comm->asyncOpCount == 1) {
|
||||
// No aggregation
|
||||
struct ncclInfo* info = comm->asyncOps;
|
||||
info->nChannels = 0;
|
||||
NCCLCHECK(ncclSaveKernel(info));
|
||||
} else {
|
||||
// Aggregation
|
||||
size_t channelSize = NCCL_AGG_CHANNEL_SIZE * comm->nRanks; // scale channel size based on nranks as latency increases
|
||||
// Reduce the per-channel size if we cannot fully utilize the channels
|
||||
while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
|
||||
for (int c = 0; c < comm->asyncOpCount; c++) {
|
||||
struct ncclInfo* info = comm->asyncOps+c;
|
||||
info->nChannels = std::min((int)DIVUP(info->nBytes, channelSize), comm->nChannels); // assign number of channels
|
||||
NCCLCHECK(ncclSaveKernel(info));
|
||||
}
|
||||
}
|
||||
// Reset counters
|
||||
comm->asyncOpCount = 0;
|
||||
comm->asyncTotalSize = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
|
||||
ncclComm_t comm = info->comm;
|
||||
if (comm->asyncOpCount >= NCCL_MAX_OPS) {
|
||||
WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
memcpy(comm->asyncOps+comm->asyncOpCount, info, sizeof(struct ncclInfo));
|
||||
comm->asyncOpCount++;
|
||||
comm->asyncTotalSize += info->nBytes;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels
|
||||
// during ncclGroupEnd()
|
||||
ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
struct ncclComm* comm = info->comm;
|
||||
struct ncclP2Plist* p2plist = &comm->p2plist;
|
||||
int peer = info->root;
|
||||
p2plist->count++;
|
||||
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
|
||||
if (info->recvbuff == NULL) {
|
||||
if (info->opName[0] == 'S') { // Send
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send.connected == 0) {
|
||||
p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
p2plist->peerlist[info->root].sendbytes = nBytes;
|
||||
p2plist->peerlist[info->root].sendbuff = info->sendbuff;
|
||||
NCCLCHECK(enqueueP2pInfo(comm->p2pSends+info->root, (void*)info->sendbuff, nBytes));
|
||||
comm->p2pSendCount++;
|
||||
} else {
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv.connected == 0) {
|
||||
p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
p2plist->peerlist[info->root].recvbytes = nBytes;
|
||||
p2plist->peerlist[info->root].recvbuff = info->recvbuff;
|
||||
NCCLCHECK(enqueueP2pInfo(comm->p2pRecvs+info->root, info->recvbuff, nBytes));
|
||||
comm->p2pRecvCount++;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
|
||||
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != info->delta; s++) {
|
||||
if (work->elems[s].p2p.nThreads == 0) return s;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
|
||||
struct ncclWorkElem* elem = work->elems+s;
|
||||
elem->comm = info->comm->devComm;
|
||||
elem->funcIndex = FUNC_INDEX_P2P;
|
||||
elem->nThreads = info->nThreads = NCCL_MAX_NTHREADS;
|
||||
elem->sendbuff = info->sendbuff;
|
||||
elem->recvbuff = info->recvbuff;
|
||||
elem->opCount = info->comm->lastOpCount;
|
||||
elem->p2p.sendCount = info->sendbytes;
|
||||
elem->p2p.recvCount = info->recvbytes;
|
||||
elem->p2p.delta = info->delta;
|
||||
const int nsegments = s+1;
|
||||
int nThreads = 512;
|
||||
while (nsegments*nThreads > 256) nThreads /= 2;
|
||||
//if (nThreads >= 128) nThreads += WARP_SIZE;
|
||||
for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
|
||||
int channelId = info->channelId;
|
||||
struct ncclChannel* channel = info->comm->channels+channelId;
|
||||
|
||||
// Try to reuse last p2p operation if not full yet
|
||||
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
|
||||
struct ncclWork* w = channel->workFifo+opIndex;
|
||||
int segment = -1;
|
||||
if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0) {
|
||||
// Try to pack more segments into a single operation
|
||||
segment = getSegment(info, w);
|
||||
}
|
||||
if (segment == -1) {
|
||||
NCCLCHECK(getNextOp(channel, &w, NULL));
|
||||
segment = 0;
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
|
||||
NCCLCHECK(saveP2pOp(info, w, segment));
|
||||
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
|
||||
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
// Launch asynchronously if needed
|
||||
if (ncclAsyncMode()) {
|
||||
@@ -585,19 +632,17 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
|
||||
NCCLCHECKGOTO(checkSetStream(info), ret, end);
|
||||
|
||||
if (info->coll == ncclCollAllToAllv)
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
|
||||
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
else
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
|
||||
if (info->coll == ncclCollSendRecv) { //p2p stored separately
|
||||
if (info->coll == ncclFuncSendRecv) { //p2p stored separately
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->lastOpCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
|
||||
NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
|
||||
}
|
||||
end:
|
||||
if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
|
||||
@@ -608,12 +653,7 @@ end:
|
||||
NCCLCHECK(ArgsCheck(info));
|
||||
NCCLCHECK(checkSetStream(info));
|
||||
|
||||
if (info->coll == ncclCollAllToAllv)
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
|
||||
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
else
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
|
||||
|
||||
@@ -25,14 +25,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->ring.prev = channel->ring.next = -1;
|
||||
channel->treeUp.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
|
||||
channel->treeDn.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
|
||||
channel->collTreeUp.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
|
||||
channel->collTreeDn.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
|
||||
channel->tree.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
|
||||
channel->collTree.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
|
||||
|
||||
int* ringIntra = ringGraph->intra+c*localRanks;
|
||||
int* treeIntra = treeGraph->intra+c*localRanks;
|
||||
@@ -46,33 +42,21 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
|
||||
}
|
||||
if (treeIntra[i] == rank) {
|
||||
int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
|
||||
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
|
||||
int parentIndex = 0;
|
||||
int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
|
||||
int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
|
||||
|
||||
// Tree loop always flows in the same direction. Other trees are symmetric, i.e.
|
||||
// up/down go in reverse directions
|
||||
int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
|
||||
|
||||
// Down tree is common
|
||||
topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
|
||||
topoRanks->treeDnSend[c] = treeIntra[sendIndex];
|
||||
channel->treeDn.up = treeIntra[prev];
|
||||
channel->treeDn.down[0] = treeIntra[next];
|
||||
// Up tree depends on the pattern
|
||||
topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
|
||||
topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
|
||||
channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
|
||||
channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
|
||||
topoRanks->treeToParent[c] = treeIntra[parentIndex];
|
||||
topoRanks->treeToChild0[c] = treeIntra[child0Index];
|
||||
topoRanks->treeToChild1[c] = treeIntra[child1Index];
|
||||
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
|
||||
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
|
||||
}
|
||||
if (collNetIntra[i] == rank) {
|
||||
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
|
||||
|
||||
// CollTrees are always symmetric, i.e.
|
||||
// up/down go in reverse directions
|
||||
channel->collTreeDn.up = collNetIntra[prev];
|
||||
channel->collTreeDn.down[0] = collNetIntra[next];
|
||||
channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
|
||||
channel->collTreeUp.up = channel->collTreeDn.up;
|
||||
channel->collTree.up = collNetIntra[prev];
|
||||
channel->collTree.down[0] = collNetIntra[next];
|
||||
}
|
||||
}
|
||||
topoRanks->ringPrev[c] = channel->ring.prev;
|
||||
@@ -122,72 +106,66 @@ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstR
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
|
||||
if (u0 != -1) tree0->up = indexes[u0];
|
||||
if (u1 != -1) tree1->up = indexes[u1];
|
||||
// Set the parent ("up") link of `tree` from a lookup table.
//   tree    : tree whose `up` field is written.
//   indexes : table indexed by `u`; indexes[u] is the value stored in tree->up.
//   u       : index into `indexes`, or -1 to mean "no parent".
// Always returns ncclSuccess.
static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
|
||||
// -1 means there is nothing to connect: leave tree->up untouched.
if (u == -1) return ncclSuccess;
|
||||
tree->up = indexes[u];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
|
||||
static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
|
||||
if (d == -1) return ncclSuccess;
|
||||
int x = 0;
|
||||
if (down[x] >= 0) x++;
|
||||
if (down[x] >= 0) {
|
||||
WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
|
||||
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
|
||||
if (x == NCCL_MAX_TREE_ARITY) {
|
||||
WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (r0 != -1) down[x++] = indexes[r0];
|
||||
if (r1 != -1) down[x++] = indexes[r1];
|
||||
tree->down[x] = indexes[d];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
|
||||
NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1));
|
||||
NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
|
||||
if (tree->down[0] == upRank) tree->down[0] = -1;
|
||||
if (rank == upRank) tree->up = -1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
|
||||
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
|
||||
const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
|
||||
int* indexesSend, *indexesRecv;
|
||||
NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
|
||||
NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
|
||||
int* ranksToParent, *ranksToChild0, *ranksToChild1;
|
||||
NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
|
||||
NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
|
||||
NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
|
||||
|
||||
// Compute tree depth. Not an exact value but a good approximation in most
|
||||
// cases
|
||||
int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
|
||||
|
||||
int u0, d0_0, d0_1, u1, d1_0, d1_1;
|
||||
NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
|
||||
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
|
||||
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel0 = comm->channels+c;
|
||||
struct ncclChannel* channel1 = channel0+nChannels;
|
||||
NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
|
||||
NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
|
||||
NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
|
||||
int root = indexesSend[node];
|
||||
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
|
||||
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
|
||||
NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
|
||||
NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
|
||||
NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
|
||||
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
|
||||
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
|
||||
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
|
||||
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
|
||||
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
|
||||
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
|
||||
channel0->treeUp.depth = channel1->treeUp.depth = depth;
|
||||
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
|
||||
if (comm->rank == ranksToParent[node]) {
|
||||
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
|
||||
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
|
||||
}
|
||||
if (comm->rank == ranksToChild0[node]) {
|
||||
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
|
||||
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
|
||||
}
|
||||
if (comm->rank == ranksToChild1[node]) {
|
||||
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
|
||||
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
|
||||
}
|
||||
if (comm->rank == ranksToParent[node] ||
|
||||
comm->rank == ranksToChild0[node] ||
|
||||
comm->rank == ranksToChild1[node]) {
|
||||
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
|
||||
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
|
||||
}
|
||||
channel0->tree.depth = channel1->tree.depth = depth;
|
||||
}
|
||||
free(indexesSend);
|
||||
free(indexesRecv);
|
||||
free(ranksToParent);
|
||||
free(ranksToChild0);
|
||||
free(ranksToChild1);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -200,13 +178,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
// Set root of collTree to id nranks
|
||||
if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
|
||||
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
|
||||
channel->collTree.up = nranks;
|
||||
}
|
||||
if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
|
||||
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
|
||||
channel->collTree.down[0] = -1;
|
||||
}
|
||||
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
|
||||
channel->collTree.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
|
||||
}
|
||||
int recvIndex = 0; // recv GPU index is always 0
|
||||
int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
|
||||
@@ -214,13 +192,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
|
||||
// Set root of collTree to id nranks
|
||||
if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
|
||||
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
|
||||
channel->collTree.up = nranks;
|
||||
}
|
||||
if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
|
||||
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
|
||||
channel->collTree.down[0] = -1;
|
||||
}
|
||||
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
|
||||
channel->collTree.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -255,35 +233,33 @@ int ncclMaxNchannels() {
|
||||
return maxNchannels;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
|
||||
int nranks = comm->nRanks;
|
||||
int nChannels = comm->nChannels;
|
||||
NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
|
||||
for (int i=0; i<nranks; i++) {
|
||||
for (int c=0; c<nChannels;c++) {
|
||||
ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
|
||||
ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
|
||||
ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
|
||||
ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
|
||||
treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
|
||||
treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
|
||||
treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
|
||||
treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
|
||||
treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
|
||||
treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
|
||||
treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
|
||||
}
|
||||
}
|
||||
|
||||
// Connect rings and trees. This should also duplicate the channels.
|
||||
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
|
||||
NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
|
||||
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
|
||||
|
||||
// Duplicate ringPrev/ringNext for ncclBuildRing
|
||||
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
|
||||
@@ -317,10 +293,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
|
||||
free(ringSend);
|
||||
free(ringPrev);
|
||||
free(ringNext);
|
||||
free(treeUpRecv);
|
||||
free(treeUpSend);
|
||||
free(treeDnRecv);
|
||||
free(treeDnSend);
|
||||
free(treeToParent);
|
||||
free(treeToChild0);
|
||||
free(treeToChild1);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -166,24 +166,20 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
|
||||
// Start with path type = link type. PATH and LINK types are supposed to match.
|
||||
// Don't consider LINK_NET as we only care about the NIC->GPU path.
|
||||
int type = link->type == LINK_NET ? 0 : link->type;
|
||||
int type = link->type == LINK_NET ? LINK_LOC : link->type;
|
||||
// Differentiate between one and multiple PCI switches
|
||||
if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
|
||||
if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
|
||||
// Consider a path going through the CPU as PATH_PHB
|
||||
if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
|
||||
// Ignore Power CPU in an NVLink path
|
||||
if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
|
||||
link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
|
||||
// Set 1 hop NVLink as NVB
|
||||
if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
|
||||
|
||||
remPath->type = std::max(path->type, type);
|
||||
|
||||
// Add to the list for the next iteration if not already in the list
|
||||
// Disallow GPUs as intermediate steps for now
|
||||
if (remNode->type != GPU) {
|
||||
int i;
|
||||
for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
|
||||
if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
|
||||
}
|
||||
int i;
|
||||
for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
|
||||
if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -303,7 +299,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
if (l == -1) {
|
||||
char* str = getenv(levelEnv);
|
||||
if (str) {
|
||||
for (int i=0; i<PATH_NET; i++) {
|
||||
for (int i=0; i<=PATH_SYS; i++) {
|
||||
if (strcmp(str, topoPathTypeStr[i]) == 0) {
|
||||
l = i;
|
||||
break;
|
||||
@@ -325,9 +321,10 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
}
|
||||
|
||||
int ncclTopoUserP2pLevel = -1;
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
|
||||
*p2p = 0;
|
||||
*read = 0;
|
||||
if (read) *read = 0;
|
||||
if (intermediateRank) *intermediateRank = -1;
|
||||
|
||||
// Get GPUs from topology
|
||||
int g1, g2;
|
||||
@@ -337,7 +334,16 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
// GPU not found, we can't use p2p.
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
// Set intermediate GPU rank, if routing through an intermediate GPU.
|
||||
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
|
||||
if (path->count == 2) {
|
||||
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
|
||||
if (intermediateNode->type == GPU && intermediateRank) {
|
||||
*intermediateRank = intermediateNode->gpu.rank;
|
||||
}
|
||||
}
|
||||
|
||||
// In general, use P2P whenever we can.
|
||||
int p2pLevel = PATH_SYS;
|
||||
@@ -366,7 +372,7 @@ compare:
|
||||
if (path->type == PATH_NVL) {
|
||||
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
|
||||
// Enable P2P Read for Ampere/NVLink only
|
||||
if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
|
||||
if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -456,8 +462,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
|
||||
// Update path when we don't want to / can't use GPU Direct P2P
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
int p2p, read;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
|
||||
if (p2p == 0) {
|
||||
// Divert all traffic through the CPU
|
||||
int cpu;
|
||||
@@ -565,8 +571,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
|
||||
// Local rank
|
||||
path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
|
||||
if (path->type == PATH_NVL) {
|
||||
int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
|
||||
double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
|
||||
float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
|
||||
*nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
|
||||
} else {
|
||||
*nChannels = 2;
|
||||
@@ -600,16 +605,9 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
|
||||
if (comm->topo->type == RCCL_TOPO_4P2H_ROME) {
|
||||
// Adjust P2P channels on Rome
|
||||
comm->p2pnChannelsPerPeer = 2;
|
||||
comm->p2pnChannels = 2;
|
||||
}
|
||||
else {
|
||||
// Round to next pow2 nChannelsPerPeer and nChannels
|
||||
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
|
||||
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
|
||||
}
|
||||
// Round to next pow2 nChannelsPerPeer and nChannels
|
||||
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
|
||||
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
|
||||
|
||||
// Init channels that weren't used so far
|
||||
for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
|
||||
|
||||
@@ -21,7 +21,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
|
||||
|
||||
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
|
||||
for (int r=0; r<nrings; r++) {
|
||||
char prefix[30];
|
||||
char prefix[40];
|
||||
/*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
|
||||
dumpLine(prev+r*nranks, nranks, prefix);
|
||||
sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
|
||||
|
||||
@@ -25,9 +25,18 @@ static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu
|
||||
}
|
||||
return maxWidth;
|
||||
}
|
||||
// Compute a total-width figure for one GPU node from its direct links.
//   system : not used by this function.
//   gpu    : node whose `links[0..nlinks)` are inspected.
// Returns the larger of (a) the sum of the widths of all LINK_NVL links and
// (b) the width of the LINK_PCI link.
static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
|
||||
float nvlinkWidth = 0.0, pciWidth = 0.0;
|
||||
for (int l=0; l<gpu->nlinks; l++) {
|
||||
struct ncclTopoLink* link = gpu->links+l;
|
||||
// NVLink widths are accumulated across all LINK_NVL links.
if (link->type == LINK_NVL) nvlinkWidth += link->width;
|
||||
// PCI width is assigned, not summed: if several LINK_PCI links exist, the
// last one seen wins. NOTE(review): presumably a GPU has a single PCI link.
if (link->type == LINK_PCI) pciWidth = link->width;
|
||||
}
|
||||
return std::max(pciWidth, nvlinkWidth);
|
||||
}
|
||||
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
|
||||
system->maxWidth = 0.0;
|
||||
system->type = 0;
|
||||
system->totalWidth = 0.0;
|
||||
int inter = system->nodes[NET].count;
|
||||
if (inter == 0 && system->nodes[GPU].count == 1) {
|
||||
system->maxWidth = LOC_WIDTH;
|
||||
@@ -36,6 +45,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
|
||||
system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -293,7 +303,6 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
|
||||
const uint64_t flag = 1ULL<<(graph->nChannels);
|
||||
struct ncclTopoNode* gpu;
|
||||
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
|
||||
if (gpu) {
|
||||
gpu->used ^= flag;
|
||||
@@ -352,11 +361,26 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
|
||||
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
|
||||
|
||||
// Balanced Tree : count half of the bandwidth on first two GPUs
|
||||
int nextBackToNet = -1;
|
||||
float speedInterSave = graph->speedInter;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
|
||||
// Count half of the bandwidth on each of the first two GPUs
|
||||
if (step == 0) nextBackToNet = 1;
|
||||
else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
|
||||
graph->speedInter /= 2;
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
|
||||
graph->speedInter = speedInterSave;
|
||||
if (net) {
|
||||
graph->inter[graph->nChannels*2+1] = net->id;
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
|
||||
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2;
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
|
||||
graph->speedInter = speedInterSave;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -493,13 +517,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
|
||||
if (system->nodes[NET].count) {
|
||||
if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
|
||||
else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
|
||||
else *backToNet = 1;
|
||||
if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
|
||||
else *backToFirstRank = -1;
|
||||
else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1;
|
||||
else *backToNet = 0;
|
||||
*backToFirstRank = -1;
|
||||
} else {
|
||||
*backToNet = -1;
|
||||
if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
|
||||
if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1;
|
||||
else *backToFirstRank = -1;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -544,7 +567,7 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
|
||||
/* User defined graph from XML file */
|
||||
/************************************/
|
||||
|
||||
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
|
||||
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
|
||||
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int* inter = graph->inter+2*c;
|
||||
@@ -1062,7 +1085,7 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#else
|
||||
float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
float speedArray[] = { 42.0, 30.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#endif
|
||||
#define NSPEEDS (sizeof(speedArray)/sizeof(float))
|
||||
|
||||
@@ -1109,11 +1132,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
|
||||
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// TODO: benchmark balance tree vs split tree
|
||||
//if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
#else
|
||||
// SPLIT_TREE works better on older archs.
|
||||
int ccMin;
|
||||
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
|
||||
if (ccMin < 80 && graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
#endif
|
||||
|
||||
struct ncclTopoGraph tmpGraph;
|
||||
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
|
||||
|
||||
// First try crossnic, then decrease speed and finally increase speedIntra.
|
||||
tmpGraph.pattern = graph->pattern;
|
||||
int pass = 1;
|
||||
int speedIndex = 0;
|
||||
while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
|
||||
@@ -1128,7 +1160,7 @@ search:
|
||||
|
||||
NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
|
||||
#if 0
|
||||
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
|
||||
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
printf("%2d : ", c);
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
@@ -1138,7 +1170,8 @@ search:
|
||||
}
|
||||
#endif
|
||||
// Optimal solution, stop here
|
||||
if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
|
||||
if (time == -1) goto done;
|
||||
if (graph->nChannels*graph->speedInter >= system->totalWidth) goto done;
|
||||
|
||||
if (pass == 1) {
|
||||
// First pass, we don't have a solution yet ; try other options
|
||||
@@ -1152,7 +1185,7 @@ search:
|
||||
|
||||
if (time != -1) globalTimeout += time;
|
||||
else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
|
||||
if (globalTimeout < 0) goto done;
|
||||
if (globalTimeout < 0 && graph->nChannels) goto done;
|
||||
|
||||
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
|
||||
if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
|
||||
@@ -1167,10 +1200,6 @@ search:
|
||||
tmpGraph.typeInter = PATH_PIX;
|
||||
|
||||
// Try a simpler tree
|
||||
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
|
||||
tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
goto search;
|
||||
}
|
||||
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
|
||||
tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
|
||||
goto search;
|
||||
|
||||
@@ -20,18 +20,17 @@
|
||||
#endif
|
||||
#include "xml.h"
|
||||
#include "cpuset.h"
|
||||
#include <numa.h>
|
||||
|
||||
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
||||
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
||||
|
||||
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "XGMI", "PIX", "PXB", "PHB", "SYS", "NET" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PHB", "SYS" };
|
||||
#else
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
|
||||
#endif
|
||||
|
||||
/******************************************************************/
|
||||
@@ -226,7 +225,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
|
||||
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
|
||||
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
|
||||
char line[1024];
|
||||
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
|
||||
INFO(NCCL_GRAPH, "==========================================");
|
||||
@@ -515,7 +514,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
}
|
||||
}
|
||||
if (remote) {
|
||||
int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
|
||||
float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
|
||||
if (remote->type != GPU) {
|
||||
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
|
||||
@@ -600,6 +599,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
|
||||
if (node == NULL) continue;
|
||||
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
|
||||
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
|
||||
}
|
||||
@@ -614,6 +614,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(collNetGetProperties(n, &props));
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
@@ -631,6 +632,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(ncclNetGetProperties(n, &props));
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
@@ -639,6 +641,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
|
||||
}
|
||||
|
||||
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
|
||||
NCCLCHECK(ncclTopoTrimXml(xml));
|
||||
|
||||
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
|
||||
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
|
||||
@@ -747,3 +752,21 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Stores the number of NET nodes recorded in the topology into *count.
// Always returns ncclSuccess.
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
  const struct ncclTopoNodeSet* netSet = &system->nodes[NET];
  *count = netSet->count;
  return ncclSuccess;
}
|
||||
|
||||
// Computes the lowest and highest cudaCompCap value among the GPU nodes of the
// topology. Either output pointer may be null when the caller does not need
// that bound. Returns ncclInternalError when the topology holds no GPU node.
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
  const int nGpus = system->nodes[GPU].count;
  if (nGpus == 0) return ncclInternalError;

  // Seed both bounds with the first GPU, then widen them with the others.
  int lowest = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
  int highest = lowest;
  for (int idx = 1; idx < nGpus; idx++) {
    const int cc = system->nodes[GPU].nodes[idx].gpu.cudaCompCap;
    if (cc < lowest) lowest = cc;
    if (cc > highest) highest = cc;
  }

  if (ccMin) *ccMin = lowest;
  if (ccMax) *ccMax = highest;
  return ncclSuccess;
}
|
||||
|
||||
@@ -13,8 +13,10 @@
|
||||
#include <sched.h>
|
||||
|
||||
#define LOC_WIDTH 5000.0
|
||||
#define PASCAL_NVLINK_WIDTH 18.0
|
||||
#define VOLTA_NVLINK_WIDTH 21.0
|
||||
#define SM60_NVLINK_WIDTH 18.0
|
||||
#define SM70_NVLINK_WIDTH 21.0
|
||||
#define SM80_NVLINK_WIDTH 21.0
|
||||
#define SM86_NVLINK_WIDTH 12.0
|
||||
#define PCI_WIDTH 12.0 // PCI Gen3 x16
|
||||
#define QPI_WIDTH 6.0
|
||||
#define SKL_QPI_WIDTH 9.0
|
||||
@@ -40,20 +42,21 @@ extern const char* topoNodeTypeStr[];
|
||||
// We want link types and path types to match as much as possible
|
||||
#define LINK_LOC 0
|
||||
#define LINK_NVL 1
|
||||
#define LINK_PCI 2
|
||||
// Skipping 3 for PATH_PXB
|
||||
// Skipping 4 for PATH_PHB
|
||||
#define LINK_SYS 5
|
||||
#define LINK_NET 6
|
||||
// Skipping 2 for PATH_NVB
|
||||
#define LINK_PCI 3
|
||||
// Skipping 4 for PATH_PXB
|
||||
// Skipping 5 for PATH_PHB
|
||||
#define LINK_SYS 6
|
||||
#define LINK_NET 7
|
||||
extern const char* topoLinkTypeStr[];
|
||||
|
||||
#define PATH_LOC 0
|
||||
#define PATH_NVL 1
|
||||
#define PATH_PIX 2
|
||||
#define PATH_PXB 3
|
||||
#define PATH_PHB 4
|
||||
#define PATH_SYS 5
|
||||
#define PATH_NET 6
|
||||
#define PATH_NVB 2
|
||||
#define PATH_PIX 3
|
||||
#define PATH_PXB 4
|
||||
#define PATH_PHB 5
|
||||
#define PATH_SYS 6
|
||||
extern const char* topoPathTypeStr[];
|
||||
|
||||
struct ncclTopoNode;
|
||||
@@ -125,6 +128,7 @@ struct ncclTopoNodeSet {
|
||||
// Topology description handed to the graph search and printing routines.
struct ncclTopoSystem {
|
||||
// One node set per node type; indexed with the node type constants (GPU, CPU, NET, ...).
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
|
||||
// Reported as "maxWidth" by ncclTopoPrint; the graph search compares candidate speeds against it.
float maxWidth;
|
||||
// Reported as "totalWidth" by ncclTopoPrint; the graph search stops once nChannels*speedInter reaches it.
float totalWidth;
|
||||
// NOTE(review): the meaning of this field is not visible in this chunk — confirm against the code that sets it.
int type;
|
||||
};
|
||||
|
||||
@@ -141,6 +145,8 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
|
||||
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
|
||||
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
|
||||
|
||||
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
|
||||
|
||||
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
|
||||
*index = -1;
|
||||
for (int i=0; i<system->nodes[type].count; i++) {
|
||||
@@ -163,4 +169,13 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Returns NVLink speed in GB/s
|
||||
static float ncclTopoNVLinkSpeed(int cudaCompCap) {
|
||||
return
|
||||
cudaCompCap == 86 ? SM86_NVLINK_WIDTH :
|
||||
cudaCompCap >= 80 ? SM80_NVLINK_WIDTH :
|
||||
cudaCompCap >= 70 ? SM70_NVLINK_WIDTH :
|
||||
cudaCompCap >= 60 ? SM60_NVLINK_WIDTH :
|
||||
SM80_NVLINK_WIDTH;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -28,7 +28,7 @@
|
||||
* / \ / \ / \ \
|
||||
* 1 3 5 7 9 11 13
|
||||
*/
|
||||
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
|
||||
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
|
||||
int up, down0, down1;
|
||||
int bit;
|
||||
for (bit=1; bit<nranks; bit<<=1) {
|
||||
@@ -37,13 +37,16 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
|
||||
|
||||
if (rank == 0) {
|
||||
*u = -1;
|
||||
*d0 = nranks > 1 ? bit >> 1 : -1;
|
||||
*d1 = -1;
|
||||
*d0 = -1;
|
||||
// Child rank is > 0 so it has to be our child 1, not 0.
|
||||
*d1 = nranks > 1 ? bit >> 1 : -1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
up = (rank ^ bit) | (bit << 1);
|
||||
// if smaller than the parent, we are his first child, otherwise we're his second
|
||||
if (up >= nranks) up = (rank ^ bit);
|
||||
*parentChildType = (rank < up) ? 0 : 1;
|
||||
*u = up;
|
||||
|
||||
int lowbit = bit >> 1;
|
||||
@@ -62,42 +65,42 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
|
||||
}
|
||||
|
||||
/* Build a double binary tree. Take the previous tree for the first tree.
|
||||
* For the second tree, we use a mirror tree (if nranks is odd)
|
||||
* For the second tree, we use a mirror tree (if nranks is even)
|
||||
*
|
||||
* 8---------0---------5
|
||||
* ______/ \______ _____/ \______
|
||||
* 4 12 1 9
|
||||
* / \ / \ / \
|
||||
* 2 6 10 3 7 10
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 2 4 6 8 11 12
|
||||
* 0---------------8 3----------------11
|
||||
* ______/ \ / \______
|
||||
* 4 \ / 7
|
||||
* / \ \ / / \
|
||||
* 2 6 10 1 5 9
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 0 2 4 6 8 10
|
||||
*
|
||||
* or shift it by one rank (if nranks is even)
|
||||
* or shift it by one rank (if nranks is odd).
|
||||
*
|
||||
* 8---------0--------------9
|
||||
* ______/ \ ______/ \
|
||||
* 4 \ 5 \
|
||||
* / \ \ / \ \
|
||||
* 2 6 10 3 7 11
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 2 4 6 8 10 1
|
||||
* 0---------------8 1---------------9
|
||||
* ______/ \______ ______/ \______
|
||||
* 4 12 5 0
|
||||
* / \ / / \ /
|
||||
* 2 6 10 3 7 11
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 2 4 6 8 10 12
|
||||
*/
|
||||
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
|
||||
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
|
||||
// First tree ... use a btree
|
||||
ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
|
||||
ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
|
||||
// Second tree ... mirror or shift
|
||||
if (nranks % 2 == 0) {
|
||||
if (nranks % 2 == 1) {
|
||||
// shift
|
||||
int shiftrank = (rank-1+nranks) % nranks;
|
||||
int u, d0, d1;
|
||||
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
|
||||
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
|
||||
*s1 = u == -1 ? -1 : (u+1) % nranks;
|
||||
*d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
|
||||
*d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
|
||||
} else {
|
||||
// mirror
|
||||
int u, d0, d1;
|
||||
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
|
||||
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
|
||||
*s1 = u == -1 ? -1 : nranks-1-u;
|
||||
*d1_0 = d0 == -1 ? -1 : nranks-1-d0;
|
||||
*d1_1 = d1 == -1 ? -1 : nranks-1-d1;
|
||||
|
||||
@@ -71,45 +71,66 @@ static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
||||
};
|
||||
|
||||
// LL128 max BW (per channel) for the different collectives
|
||||
// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
|
||||
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
|
||||
// ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce
|
||||
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.9 };
|
||||
static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
|
||||
static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} };
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
#else
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
|
||||
#endif
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
|
||||
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
||||
|
||||
if (comm->nRanks <= 1) return ncclSuccess;
|
||||
int nNodes = comm->nNodes;
|
||||
int nRanks = comm->nRanks;
|
||||
if (nRanks <= 1) return ncclSuccess;
|
||||
|
||||
int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
|
||||
float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
int cpuArch, cpuVendor, cpuModel;
|
||||
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
|
||||
int index2 = nNodes <= 2 ? nNodes-1 : 2;
|
||||
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
|
||||
int index1 = nNodes == 1 ? compCap80 : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
|
||||
double llMaxBw = llMaxBws[index1][index2];
|
||||
double perChMaxTreeBw = perChMaxTreeBws[compCap80][index2];
|
||||
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
|
||||
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
|
||||
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
||||
|
||||
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
|
||||
int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
|
||||
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
|
||||
comm->nRanks;
|
||||
int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
|
||||
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
|
||||
comm->nNodes;
|
||||
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
|
||||
nRanks;
|
||||
int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
|
||||
nNodes;
|
||||
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
|
||||
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
|
||||
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
|
||||
float busBw = graphs[a]->nChannels * speed;
|
||||
|
||||
// Various model refinements
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0;
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
double maxTreeBw = comm->nNodes > 2 ?
|
||||
@@ -118,21 +139,29 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
|
||||
#else
|
||||
if (compCap80) busBw = std::min(busBw, 235.0f);
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
#endif
|
||||
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
|
||||
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
|
||||
comm->bandwidths[coll][a][p] = busBw * ratio;
|
||||
|
||||
comm->latencies[coll][a][p] = baseLat[a][p];
|
||||
float intraLat = hwLat[intraHw[a]][a][p];
|
||||
float interLat = hwLat[NCCL_HW_NET][a][p];
|
||||
if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
||||
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
||||
if (a == NCCL_ALGO_RING) {
|
||||
float lat = hwLat[hw[a]][a][p];
|
||||
if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
|
||||
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
|
||||
if (ringGraph->sameChannels) {
|
||||
comm->latencies[coll][a][p] += lat;
|
||||
} else {
|
||||
@@ -144,10 +173,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
} else if (a == NCCL_ALGO_TREE) {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
|
||||
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
|
||||
} else {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
|
||||
2 * (nRanks/nNodes-1) * intraLat + interLat;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,6 +197,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
|
||||
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||
}
|
||||
// Disable CollNet if it is not supported
|
||||
if (comm->collNetSupport == 0) {
|
||||
algoEnable[NCCL_ALGO_COLLNET] = 0;
|
||||
// If user has hard set NCCL_ALGO=COLLNET, ignore it
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
|
||||
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
|
||||
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
|
||||
}
|
||||
}
|
||||
|
||||
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
int pEnable = protoEnable[p];
|
||||
@@ -178,7 +216,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
||||
// Only disable algo for Allreduce since others only have one
|
||||
if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||
}
|
||||
|
||||
if (comm->rank == 0) {
|
||||
@@ -214,7 +252,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
|
||||
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
|
||||
}
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
|
||||
|
||||
// Override defaults with user env
|
||||
char* str = getenv("NCCL_THREAD_THRESHOLDS");
|
||||
@@ -263,8 +301,16 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
*time = -1.0; return ncclSuccess;
|
||||
}
|
||||
int logSize = log2i(info->nBytes>>6);
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize];
|
||||
#else
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
|
||||
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
|
||||
#endif
|
||||
*time = lat + (info->nBytes) / (1000 * bw);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -572,7 +572,6 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
|
||||
NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
|
||||
if (index == -1) {
|
||||
if (nvmlDev == NULL) {
|
||||
//WARN("No NVML, trying to use CUDA instead");
|
||||
const char* busId;
|
||||
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
|
||||
if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1;
|
||||
@@ -714,6 +713,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
|
||||
char* path;
|
||||
NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
|
||||
free(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -725,10 +725,14 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
|
||||
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
|
||||
NCCLCHECK(wrapNvmlSymbols());
|
||||
NCCLCHECK(wrapNvmlInit());
|
||||
nvmlDevice_t nvmlDev;
|
||||
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
|
||||
nvmlDevice_t nvmlDev = NULL;
|
||||
static int nvmlInit = 0;
|
||||
if (nvmlInit == 0) {
|
||||
nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
|
||||
}
|
||||
if (nvmlInit == 1) {
|
||||
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -771,12 +775,8 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
|
||||
for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
strcpy(busId, pciSysPath+offset+1);
|
||||
NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
|
||||
if (parent == NULL) {
|
||||
NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
|
||||
NCCLCHECK(xmlSetAttr(parent, "busid", busId));
|
||||
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
|
||||
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
|
||||
} else {
|
||||
// Virtual NIC, no PCI device, attach to first CPU
|
||||
NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
|
||||
@@ -795,6 +795,28 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Recursively prunes the XML tree rooted at node.
// A node carrying keep="1" is retained as is (its children are not visited)
// and the marker attribute is removed from it. Any other node first has its
// children trimmed and is then removed itself if no child is left.
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
  const char* keep;
  NCCLCHECK(xmlGetAttr(node, "keep", &keep));
  if (keep && strcmp(keep, "1") == 0) {
    // Marked node: strip the marker and leave everything below untouched.
    NCCLCHECK(xmlUnsetAttr(node, "keep"));
    return ncclSuccess;
  }

  // Iterate over a snapshot of the child list: trimming a child may remove it,
  // which edits node->subs and node->nSubs underneath us.
  const int nChildren = node->nSubs;
  struct ncclXmlNode* children[MAX_SUBS];
  memcpy(children, node->subs, nChildren*sizeof(struct ncclXmlNode*));
  for (int c=0; c<nChildren; c++) {
    NCCLCHECK(ncclTopoTrimXmlRec(children[c]));
  }

  // Nothing worth keeping below this node: drop it as well.
  if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node));
  return ncclSuccess;
}
|
||||
// Trims the XML document, starting the recursive pruning at the first entry of
// xml->nodes (presumably the document's top-level node — confirm against the
// XML loader).
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
  struct ncclXmlNode* first = xml->nodes;
  NCCLCHECK(ncclTopoTrimXmlRec(first));
  return ncclSuccess;
}
|
||||
|
||||
/**************************************************/
|
||||
/* Parser rules for the user-defined graph search */
|
||||
/**************************************************/
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#define XML_H_
|
||||
|
||||
// A few constraints to make the implementation easy
|
||||
#define MAX_STR_LEN 256
|
||||
#define MAX_STR_LEN 255
|
||||
#define MAX_ATTR_COUNT 16
|
||||
#define MAX_SUBS 32
|
||||
#define MAX_NODES 1024
|
||||
@@ -19,10 +19,10 @@
|
||||
#define NODE_TYPE_SINGLE 3
|
||||
|
||||
struct ncclXmlNode {
|
||||
char name[MAX_STR_LEN];
|
||||
char name[MAX_STR_LEN+1];
|
||||
struct {
|
||||
char key[MAX_STR_LEN];
|
||||
char value[MAX_STR_LEN];
|
||||
char key[MAX_STR_LEN+1];
|
||||
char value[MAX_STR_LEN+1];
|
||||
} attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
|
||||
int nAttrs;
|
||||
int type;
|
||||
@@ -47,6 +47,9 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
|
||||
ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
|
||||
ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
|
||||
|
||||
/* Remove unneeded parts */
|
||||
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
|
||||
|
||||
/**************/
|
||||
/* XML Struct */
|
||||
/* Functions */
|
||||
@@ -56,7 +59,7 @@ static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrNa
|
||||
*index = -1;
|
||||
const int nAttrs = node->nAttrs;
|
||||
for (int a=0; a<nAttrs; a++) {
|
||||
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
|
||||
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) == 0) {
|
||||
*index = a;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -127,8 +130,10 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
strncpy(node->attrs[index].value, value, MAX_STR_LEN);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -138,8 +143,10 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -149,8 +156,22 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Removes the attribute called attrName from node.
// The attributes stored after it are moved down by one slot so that the
// remaining ones keep their relative order. An absent attribute is not an
// error: the node is left unchanged and ncclSuccess is returned.
static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
  int pos;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &pos));
  if (pos != -1) {
    // Overwrite every slot from pos onward with its successor.
    for (int dst = pos; dst+1 < node->nAttrs; dst++) {
      strcpy(node->attrs[dst].key, node->attrs[dst+1].key);
      strcpy(node->attrs[dst].value, node->attrs[dst+1].value);
    }
    node->nAttrs--;
  }
  return ncclSuccess;
}
|
||||
|
||||
@@ -199,6 +220,20 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
|
||||
s->parent = parent;
|
||||
if (parent) parent->subs[parent->nSubs++] = s;
|
||||
strncpy(s->name, subName, MAX_STR_LEN);
|
||||
s->name[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
|
||||
node->type = NODE_TYPE_NONE;
|
||||
struct ncclXmlNode* parent = node->parent;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
int shift = 0;
|
||||
for (int s=0; s<parent->nSubs; s++) {
|
||||
if (parent->subs[s] == node) shift = 1;
|
||||
else if (shift) parent->subs[s-1] = parent->subs[s];
|
||||
}
|
||||
parent->nSubs--;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+115
-79
@@ -35,7 +35,6 @@ struct ncclInitArgs {
|
||||
};
|
||||
struct ncclCollArgs {
|
||||
ncclComm_t comm;
|
||||
int connect;
|
||||
};
|
||||
|
||||
enum ncclAsyncFuncType {
|
||||
@@ -110,6 +109,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
ncclResult_t ncclGroupStart() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
|
||||
}
|
||||
@@ -118,7 +118,7 @@ ncclResult_t ncclGroupStart() {
|
||||
}
|
||||
|
||||
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
|
||||
struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
|
||||
sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
info.delta = delta;
|
||||
@@ -126,26 +126,32 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
|
||||
info.sendbytes = sendbytes;
|
||||
info.recvbytes = recvbytes;
|
||||
if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
|
||||
NCCLCHECK(ncclSaveKernel(&info));
|
||||
NCCLCHECK(ncclSaveP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* ncclAsyncThreadPreconnect(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
CUDACHECKTHREAD(hipSetDevice(args->coll.comm->cudaDev));
|
||||
for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
struct ncclP2PConnect* connect = &comm->p2plist.connect;
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
|
||||
connect->nrecv[c] = 0;
|
||||
connect->nsend[c] = 0;
|
||||
}
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL));
|
||||
return args;
|
||||
}
|
||||
|
||||
static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
|
||||
size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
|
||||
int nChannels = minChannels;
|
||||
while (size > maxSize && nChannels <= maxChannels/2) {
|
||||
nChannels *= 2;
|
||||
size = DIVUP(totalSize, nChannels);
|
||||
}
|
||||
ALIGN_SIZE(size, minSize);
|
||||
return size;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
WARN("ncclGroupEnd: not in a group call.");
|
||||
return ncclInvalidUsage;
|
||||
@@ -186,29 +192,21 @@ ncclResult_t ncclGroupEnd() {
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
|
||||
if (p2plist->count != 0) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
args->coll.connect = 0;
|
||||
for (int c=0; c<comm->p2pnChannels; c++)
|
||||
args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
|
||||
if (args->coll.connect) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s\n", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,56 +216,98 @@ ncclResult_t ncclGroupEnd() {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
int rank = comm->rank;
|
||||
int nRanks = comm->nRanks;
|
||||
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
|
||||
if (p2plist->count) {
|
||||
for (int delta=0; delta<nRanks; delta++) {
|
||||
struct ncclP2Plist* p2pSends = comm->p2pSends;
|
||||
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
// Try to use all channels
|
||||
int nChannelsMax = comm->p2pnChannelsPerPeer;
|
||||
int nChannelsMin = nChannelsMax;
|
||||
// Try to use all channels, but one channel per operation.
|
||||
while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
|
||||
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
|
||||
while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
|
||||
|
||||
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
// schedule delta 0, +1, -1, +2, -2, ...
|
||||
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
|
||||
for (int d=0; d<=nRanks/4; d++) {
|
||||
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
|
||||
int index = 0;
|
||||
int delta = deltas[index];
|
||||
sched_delta:
|
||||
uint32_t from = (rank+nRanks-delta)%nRanks;
|
||||
uint32_t to = (rank+delta)%nRanks;
|
||||
struct ncclP2Pinfo* recv = p2pRecvs[from].head;
|
||||
struct ncclP2Pinfo* send = p2pSends[to].head;
|
||||
if (recv != NULL || send != NULL) {
|
||||
ssize_t totRecvBytes = -1, totSendBytes = -1;
|
||||
if (recv != NULL) totRecvBytes = recv->nbytes;
|
||||
if (send != NULL) totSendBytes = send->nbytes;
|
||||
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
// Split each operation on p2pnChannelsPerPeer max.
|
||||
ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
|
||||
ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
|
||||
recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
|
||||
sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
|
||||
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int remaining = 1;
|
||||
int chunk = 0;
|
||||
while (remaining) {
|
||||
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
|
||||
remaining = 0;
|
||||
ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
|
||||
ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
|
||||
if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
|
||||
if (sendbytes >= 0 || recvbytes >= 0) {
|
||||
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
|
||||
recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
|
||||
sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int sendRemaining = 1, recvRemaining = 1;
|
||||
int chunk = 0;
|
||||
do {
|
||||
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
|
||||
ssize_t recvbytes = totRecvBytes-recvOffset;
|
||||
ssize_t sendbytes = totSendBytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
|
||||
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
|
||||
if (sendbytes >= 0 || recvbytes >= 0) {
|
||||
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
|
||||
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
|
||||
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
chunk++;
|
||||
} while (sendRemaining || recvRemaining);
|
||||
if (recv) {
|
||||
NCCLCHECKGOTO(dequeueP2pInfo(p2pRecvs+from), ret, group_cleanup);
|
||||
comm->p2pRecvCount--;
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
chunk++;
|
||||
if (send) {
|
||||
NCCLCHECKGOTO(dequeueP2pInfo(p2pSends+to), ret, group_cleanup);
|
||||
comm->p2pSendCount--;
|
||||
}
|
||||
}
|
||||
index++;
|
||||
if (index == 1 && deltas[1] == deltas[0]) index++;
|
||||
if (index == 2 && deltas[2] == deltas[0]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[2]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[1]) index++;
|
||||
if (index < 4) {
|
||||
delta = deltas[index];
|
||||
goto sched_delta;
|
||||
}
|
||||
}
|
||||
p2plist->count = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Collectives are done in four steps :
|
||||
* 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
|
||||
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
|
||||
* 2. Barrier Wait. No CUDA call is permitted
|
||||
* 3. Enqueue Events. CUDA event wait/enqueue.
|
||||
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
|
||||
* hipFree happens between 1 and 3, it could block that CUDA call and
|
||||
* cudaFree happens between 1 and 3, it could block that CUDA call and
|
||||
* prevent some ranks from launching their network threads, which would
|
||||
* prevent the NCCL call from completing, blocking the hipFree call.
|
||||
* prevent the NCCL call from completing, blocking the cudaFree call.
|
||||
*/
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
ncclComm_t comm = args->coll.comm;
|
||||
NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
@@ -304,32 +344,28 @@ group_cleanup:
|
||||
*args->init.newcomm = NULL;
|
||||
} else {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
for (int i=0; i<channel->collCount; i++) {
|
||||
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
|
||||
// Reset aggregation counters
|
||||
comm->asyncOpCount = 0;
|
||||
comm->asyncTotalSize = 0;
|
||||
// Dequeue p2p lists
|
||||
if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
struct ncclP2Plist* p2pSends = comm->p2pSends;
|
||||
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
|
||||
for (int peer=0; peer<comm->nRanks; peer++) {
|
||||
while (p2pSends[peer].head != NULL) dequeueP2pInfo(p2pSends+peer);
|
||||
while (p2pRecvs[peer].head != NULL) dequeueP2pInfo(p2pRecvs+peer);
|
||||
}
|
||||
channel->collFifoTail = channel->collStart;
|
||||
channel->collCount = 0;
|
||||
comm->p2pSendCount = comm->p2pRecvCount = 0;
|
||||
}
|
||||
/* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
|
||||
/* Free all proxy ops in state->nextOps */
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
struct ncclProxyArgs *op, *start;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
op = start = state->ops;
|
||||
while (op) {
|
||||
if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
|
||||
struct ncclProxyArgs* peerOp = op->nextPeer;
|
||||
while (peerOp) {
|
||||
if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
|
||||
peerOp = peerOp->nextPeer;
|
||||
}
|
||||
op = op->next;
|
||||
if (op == start) break;
|
||||
pthread_mutex_lock(&state->poolMutex);
|
||||
for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
|
||||
op->next = state->pool;
|
||||
state->pool = op;
|
||||
}
|
||||
comm->opCount = comm->lastOpCount;
|
||||
pthread_cond_signal(&state->cond);
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
pthread_mutex_unlock(&state->poolMutex);
|
||||
state->nextOps = NULL;
|
||||
|
||||
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
|
||||
comm->userStreamSet = false;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -16,6 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
|
||||
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
|
||||
ncclResult_t bootstrapClose(void* commState);
|
||||
ncclResult_t bootstrapAbort(void* commState);
|
||||
#endif
|
||||
|
||||
@@ -24,7 +24,7 @@ static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, voi
|
||||
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
@@ -8,63 +8,60 @@
|
||||
#ifndef NCCL_COLLECTIVES_H_
|
||||
#define NCCL_COLLECTIVES_H_
|
||||
|
||||
#define FUNC_INDEX_P2P (4+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
|
||||
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((coll >= NCCL_NUM_FUNCTIONS) \
|
||||
? (coll-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
|
||||
: ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
|
||||
#define FUNC_INDEX_P2P 1800
|
||||
#define FUNC_INDEX(func, redop, ncclType, al, pr) ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
|
||||
|
||||
#define NCCL_COLL_NAME(coll, op, dtype) \
|
||||
coll##_##op##_##dtype
|
||||
#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
|
||||
ncclFunction_##func##_##algo##_##proto##_##redop##_##type
|
||||
|
||||
#define NCCL_KERN_NAME(coll, op, dtype) \
|
||||
coll##Kernel_##op##_##dtype
|
||||
#define NCCL_KERN_NAME(func, algo, proto, redop, type) \
|
||||
ncclKernel_##func##_##algo##_##proto##_##redop##_##type
|
||||
|
||||
#define NCCL_IMPL_NAME(func, algo, proto) \
|
||||
nccl##func##algo##proto
|
||||
|
||||
/* Declare all collective operations */
|
||||
#define DECL_COLL5(coll, op, dtype) \
|
||||
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
|
||||
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm); \
|
||||
#define DECL5(func, algo, proto, redop, type) \
|
||||
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args); \
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first); \
|
||||
|
||||
#define DECL_COLL4(coll, op, dtype) \
|
||||
DECL_COLL5(coll, op, dtype) \
|
||||
DECL_COLL5(coll##LL, op, dtype) \
|
||||
DECL_COLL5(coll##LL128, op, dtype)
|
||||
#define DECL4(func, algo, redop, type) \
|
||||
DECL5(func, algo, SIMPLE, redop, type) \
|
||||
DECL5(func, algo, LL, redop, type) \
|
||||
DECL5(func, algo, LL128, redop, type)
|
||||
|
||||
#define DECL_COLL3(coll, op, dtype) \
|
||||
DECL_COLL4(coll##Ring, op, dtype) \
|
||||
DECL_COLL4(coll##Tree, op, dtype) \
|
||||
DECL_COLL4(coll##CollNet, op, dtype)
|
||||
#define DECL3(func, redop, type) \
|
||||
DECL4(func, RING, redop, type) \
|
||||
DECL4(func, TREE, redop, type) \
|
||||
DECL4(func, COLLNET, redop, type)
|
||||
|
||||
#define DECL_COLL2(coll, op) \
|
||||
DECL_COLL3(coll, op, i8) \
|
||||
DECL_COLL3(coll, op, u8) \
|
||||
DECL_COLL3(coll, op, i32) \
|
||||
DECL_COLL3(coll, op, u32) \
|
||||
DECL_COLL3(coll, op, i64) \
|
||||
DECL_COLL3(coll, op, u64) \
|
||||
DECL_COLL3(coll, op, f16) \
|
||||
DECL_COLL3(coll, op, f32) \
|
||||
DECL_COLL3(coll, op, f64) \
|
||||
DECL_COLL3(coll, op, b16)
|
||||
#define DECL2(func, redop) \
|
||||
DECL3(func, redop, int8_t) \
|
||||
DECL3(func, redop, uint8_t) \
|
||||
DECL3(func, redop, int32_t) \
|
||||
DECL3(func, redop, uint32_t) \
|
||||
DECL3(func, redop, int64_t) \
|
||||
DECL3(func, redop, uint64_t) \
|
||||
DECL3(func, redop, half) \
|
||||
DECL3(func, redop, float) \
|
||||
DECL3(func, redop, double) \
|
||||
DECL3(func, redop, rccl_bfloat16)
|
||||
|
||||
#define DECL_COLL(coll) \
|
||||
DECL_COLL2(coll, sum) \
|
||||
DECL_COLL2(coll, prod) \
|
||||
DECL_COLL2(coll, min) \
|
||||
DECL_COLL2(coll, max)
|
||||
#define DECL(func) \
|
||||
DECL2(func, Sum) \
|
||||
DECL2(func, Prod) \
|
||||
DECL2(func, Min) \
|
||||
DECL2(func, Max)
|
||||
|
||||
#define DECL_ALL_COLLS \
|
||||
DECL_COLL2(ncclBroadcast, copy) \
|
||||
DECL_COLL(ncclReduce) \
|
||||
DECL_COLL2(ncclAllGather, copy) \
|
||||
DECL_COLL(ncclReduceScatter) \
|
||||
DECL_COLL(ncclAllReduce) \
|
||||
DECL_COLL5(ncclGather, copy, i8) \
|
||||
DECL_COLL5(ncclScatter, copy, i8) \
|
||||
DECL_COLL5(ncclAllToAll, copy, i8) \
|
||||
DECL_COLL5(ncclAllToAllv, copy, i8) \
|
||||
DECL_COLL5(ncclSendRecv, copy, i8) \
|
||||
#define DECL_ALL \
|
||||
DECL2(Broadcast, Sum) \
|
||||
DECL(Reduce) \
|
||||
DECL2(AllGather, Sum) \
|
||||
DECL(ReduceScatter) \
|
||||
DECL(AllReduce) \
|
||||
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
|
||||
|
||||
DECL_ALL_COLLS
|
||||
DECL_ALL
|
||||
|
||||
// CHUNKSIZE must be a multiple of SLICESIZE
|
||||
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
|
||||
@@ -84,13 +81,4 @@ DECL_ALL_COLLS
|
||||
#define REDUCE_SLICESTEPS 1
|
||||
#define REDUCE_CHUNKSTEPS 1
|
||||
#define SENDRECV_SLICEFACTOR 1
|
||||
#define GATHER_SLICESTEPS 4
|
||||
#define GATHER_CHUNKSTEPS 4
|
||||
#define SCATTER_SLICESTEPS 4
|
||||
#define SCATTER_CHUNKSTEPS 4
|
||||
#define ALLTOALL_SLICESTEPS 4
|
||||
#define ALLTOALL_CHUNKSTEPS 4
|
||||
#define ALLTOALLV_SLICESTEPS 4
|
||||
#define ALLTOALLV_CHUNKSTEPS 4
|
||||
|
||||
#endif
|
||||
|
||||
@@ -52,8 +52,8 @@ struct ncclRecvMem {
|
||||
struct {
|
||||
uint64_t tail;
|
||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
int sizesFifo[NCCL_STEPS];
|
||||
void* ptrsFifo[NCCL_STEPS];
|
||||
};
|
||||
char pad4[MEM_ALIGN];
|
||||
};
|
||||
@@ -67,6 +67,10 @@ struct ncclComm {
|
||||
struct ncclTopoSystem* topo;
|
||||
|
||||
void* bootstrap;
|
||||
// Bitmasks for ncclTransportP2pSetup
|
||||
int connect;
|
||||
uint32_t* connectSend;
|
||||
uint32_t* connectRecv;
|
||||
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
@@ -131,8 +135,8 @@ struct ncclComm {
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
struct ncclDevComm* args;
|
||||
struct ncclDevComm** argsptr;
|
||||
struct ncclWorkElem args;
|
||||
void* argsptr;
|
||||
|
||||
// Global proxy thread
|
||||
pthread_t proxyThread;
|
||||
@@ -140,8 +144,17 @@ struct ncclComm {
|
||||
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
|
||||
// Store info of async operations
|
||||
struct ncclInfo* asyncOps;
|
||||
int asyncOpCount;
|
||||
size_t asyncTotalSize;
|
||||
|
||||
// List of async p2p operations queued under group semantics
|
||||
struct ncclP2Plist p2plist;
|
||||
struct ncclP2Plist* p2pSends;
|
||||
struct ncclP2Plist* p2pRecvs;
|
||||
int p2pSendCount;
|
||||
int p2pRecvCount;
|
||||
|
||||
// RCCL AllToAll/Scatter/Gather API
|
||||
bool alltoallDisable;
|
||||
|
||||
@@ -57,5 +57,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
#include "alloc.h"
|
||||
#include "utils.h"
|
||||
#include "param.h"
|
||||
#include "nvtx_stub.h"
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
|
||||
|
||||
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
|
||||
|
||||
ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
|
||||
static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
|
||||
uint32_t cpumasks[CPU_SET_N_U32];
|
||||
int m = CPU_SET_N_U32-1;
|
||||
cpumasks[m] = 0;
|
||||
@@ -42,7 +42,7 @@ ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
|
||||
static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
|
||||
int c = 0;
|
||||
uint8_t* m8 = (uint8_t*)mask;
|
||||
for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
|
||||
|
||||
@@ -23,8 +23,8 @@
|
||||
#endif
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
|
||||
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
|
||||
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
|
||||
#define NCCL_ALGO_TREE 0
|
||||
@@ -59,6 +59,7 @@ union ncclLLFifoLine {
|
||||
#define WARP_SIZE 64
|
||||
#define MAXCHANNELS 32
|
||||
#define NCCL_MAX_NTHREADS 256
|
||||
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
|
||||
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
|
||||
#define NCCL_LL_LINES_PER_THREAD 8
|
||||
#ifdef TEST_LL_CLEANUP
|
||||
@@ -72,7 +73,7 @@ union ncclLLFifoLine {
|
||||
// Make sure the clean mask will last for at least NCCL_NSTEPS
|
||||
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
|
||||
|
||||
#define NCCL_LL128_LINESIZE 64
|
||||
#define NCCL_LL128_LINESIZE 128
|
||||
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
|
||||
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
|
||||
|
||||
@@ -83,15 +84,12 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
// to 3 dests. Use 70% for reduce and 30% for bcast.
|
||||
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
|
||||
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
|
||||
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
|
||||
#define NCCL_DIRECT_GPU 0x01
|
||||
#define NCCL_DIRECT_NIC 0x10
|
||||
|
||||
#define MAXBARRIERS 2
|
||||
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
||||
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
|
||||
@@ -99,9 +97,11 @@ struct ncclConnInfo {
|
||||
uint64_t *head; // Local for send, remote for recv
|
||||
|
||||
int direct; // Direct communication
|
||||
int shared; // Buffers are shared
|
||||
void **ptrExchange; // Pointer exchange for direct communication
|
||||
|
||||
int *fifo; // Size fifo for proxy
|
||||
int *sizesFifo; // Sizes fifo from GPU to proxy
|
||||
void* *ptrsFifo; // Buffer fifo from proxy to GPU
|
||||
|
||||
uint64_t step; // Keep where we are
|
||||
uint64_t llLastCleaning;
|
||||
@@ -110,7 +110,6 @@ struct ncclConnInfo {
|
||||
// allows software to explicitly initiate a flush read to HDP memory. See more
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
@@ -151,68 +150,53 @@ struct ncclDevComm;
|
||||
|
||||
#pragma pack(push) /* push current alignment to stack */
|
||||
#pragma pack(4) /* set alignment to 4 bytes boundary */
|
||||
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclColl. */
|
||||
struct CollectiveArgs {
|
||||
struct ncclDevComm* comm;
|
||||
uint64_t opCount;
|
||||
#define NCCL_MAX_WORK_ELEMENTS 2
|
||||
#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
|
||||
|
||||
/* ncclWork is to be a power of two, currently 8x64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclWorkElem. */
|
||||
struct ncclWorkElem {
|
||||
// Header
|
||||
struct ncclDevComm* comm;
|
||||
uint16_t nThreads;
|
||||
uint16_t funcIndex;
|
||||
uint16_t index;
|
||||
uint16_t active;
|
||||
|
||||
// local and remote input, output, and buffer
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
|
||||
// Op-specific fields. Make sure the common part stays the
|
||||
// same on all structs of the union
|
||||
uint64_t opCount;
|
||||
// Op-specific fields.
|
||||
union {
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
} common;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint32_t root;
|
||||
size_t count;
|
||||
size_t lastChunkSize;
|
||||
} coll;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint16_t unused;
|
||||
int32_t delta;
|
||||
size_t sendCount;
|
||||
size_t recvCount;
|
||||
} p2p;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint32_t root;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
size_t count;
|
||||
size_t* extra;
|
||||
} a2av;
|
||||
};
|
||||
};
|
||||
struct ncclColl {
|
||||
union {
|
||||
} coll;
|
||||
struct {
|
||||
struct CollectiveArgs args;
|
||||
uint16_t funcIndex;
|
||||
uint16_t nextIndex;
|
||||
uint8_t active;
|
||||
};
|
||||
int data[0x10];
|
||||
size_t sendCount;
|
||||
size_t recvCount;
|
||||
int32_t delta;
|
||||
uint16_t nThreads;
|
||||
} p2p;
|
||||
uint64_t align[3];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
|
||||
struct ncclWork {
|
||||
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
|
||||
};
|
||||
static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
|
||||
|
||||
struct ncclChannel {
|
||||
union {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree treeUp;
|
||||
struct ncclTree treeDn;
|
||||
struct ncclTree collTreeUp;
|
||||
struct ncclTree collTreeDn;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree collTree;
|
||||
|
||||
int id;
|
||||
|
||||
@@ -221,16 +205,10 @@ struct ncclChannel {
|
||||
struct ncclPeer* devPeers;
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclColl* collectives;
|
||||
size_t* collectivesExtra;
|
||||
int collStart;
|
||||
int collCount;
|
||||
int collFifoHead; // Only used by GPU
|
||||
int collFifoTail; // Only used by CPU
|
||||
struct ncclWork* workFifo;
|
||||
int workCount;
|
||||
uint64_t workFifoTail; // Only used by CPU
|
||||
|
||||
uint32_t* sync;
|
||||
uint64_t* barrier;
|
||||
uint64_t* barrier_next;
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct timeval tvs;
|
||||
uint64_t sizes;
|
||||
@@ -288,9 +266,11 @@ struct ncclProf {
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
typedef enum {
|
||||
ncclCollTraceNotReady,
|
||||
ncclCollTraceKernelLaunchType,
|
||||
ncclCollTraceCollEndType,
|
||||
ncclCollTraceAbortType
|
||||
ncclCollTraceAbortType,
|
||||
ncclCollTraceDataType
|
||||
} ncclCollTraceDataType_t;
|
||||
|
||||
struct ncclCollTrace {
|
||||
@@ -304,7 +284,7 @@ struct ncclCollTrace {
|
||||
};
|
||||
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
|
||||
|
||||
#define COLLTRACE_NUM_ITEMS 1024
|
||||
#define COLLTRACE_NUM_ITEMS 8192
|
||||
#endif
|
||||
|
||||
struct ncclDevComm {
|
||||
|
||||
@@ -19,5 +19,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
|
||||
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
@@ -29,7 +29,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
|
||||
// Set CPU affinity
|
||||
@@ -45,15 +45,16 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
|
||||
#define NCCL_TOPO_CPU_TYPE_ZEN 3
|
||||
#define NCCL_TOPO_CPU_TYPE_ROME 4
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
|
||||
// Init search. Needs to be done before calling ncclTopoCompute
|
||||
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
||||
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
|
||||
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
|
||||
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
|
||||
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
|
||||
#define NCCL_TOPO_PATTERN_RING 4 // Ring
|
||||
struct ncclTopoGraph {
|
||||
// Input / output
|
||||
@@ -84,17 +85,16 @@ struct ncclTopoRanks {
|
||||
int ringSend[MAXCHANNELS];
|
||||
int ringPrev[MAXCHANNELS];
|
||||
int ringNext[MAXCHANNELS];
|
||||
int treeUpRecv[MAXCHANNELS];
|
||||
int treeUpSend[MAXCHANNELS];
|
||||
int treeDnRecv[MAXCHANNELS];
|
||||
int treeDnSend[MAXCHANNELS];
|
||||
int treeToParent[MAXCHANNELS];
|
||||
int treeToChild0[MAXCHANNELS];
|
||||
int treeToChild1[MAXCHANNELS];
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
|
||||
|
||||
@@ -20,8 +20,7 @@ typedef enum {
|
||||
ncclPatternTreeDown,
|
||||
ncclPatternTreeUpDown,
|
||||
ncclPatternCollTreeUp,
|
||||
ncclPatternCollTreeDown,
|
||||
ncclPatternAll
|
||||
ncclPatternCollTreeDown
|
||||
} ncclPattern_t;
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
@@ -40,11 +39,6 @@ struct ncclInfo {
|
||||
// Algorithm details
|
||||
int chunkSteps;
|
||||
int sliceSteps;
|
||||
// For alltoallv
|
||||
const size_t *sendcounts;
|
||||
const size_t *sdispls;
|
||||
const size_t *recvcounts;
|
||||
const size_t *rdispls;
|
||||
// Computed later
|
||||
int algorithm;
|
||||
int protocol;
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
#define NCCL_PTR_HOST 0x1
|
||||
#define NCCL_PTR_CUDA 0x2
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 8
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
@@ -29,9 +32,9 @@ typedef struct {
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
}ncclNetProperties_v3_t;
|
||||
}ncclNetProperties_v4_t;
|
||||
|
||||
typedef ncclNetProperties_v3_t ncclNetProperties_t;
|
||||
typedef ncclNetProperties_v4_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
@@ -41,7 +44,7 @@ typedef struct {
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
@@ -62,7 +65,7 @@ typedef struct {
|
||||
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
|
||||
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
@@ -70,11 +73,11 @@ typedef struct {
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v3_t;
|
||||
} ncclNet_v4_t;
|
||||
|
||||
typedef ncclNet_v3_t ncclNet_t;
|
||||
typedef ncclNet_v4_t ncclNet_t;
|
||||
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
@@ -85,7 +88,7 @@ typedef struct {
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
@@ -105,17 +108,17 @@ typedef struct {
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v3_t;
|
||||
} ncclCollNet_v4_t;
|
||||
|
||||
typedef ncclCollNet_v3_t ncclCollNet_t;
|
||||
typedef ncclCollNet_v4_t ncclCollNet_t;
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -25,7 +25,7 @@ static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, voi
|
||||
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -45,14 +45,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
|
||||
NVMLCHECK(nvmlDeviceGetIndex(device, index));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
|
||||
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
|
||||
NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
|
||||
NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
|
||||
return ncclSuccess;
|
||||
@@ -66,10 +58,6 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
|
||||
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
|
||||
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
|
||||
NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
|
||||
return ncclSuccess;
|
||||
@@ -150,12 +138,10 @@ ncclResult_t wrapNvmlShutdown(void);
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
|
||||
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
|
||||
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
|
||||
nvmlNvLinkCapability_t capability, unsigned int *capResult);
|
||||
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
|
||||
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
|
||||
|
||||
#endif // NVML_DIRECT
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NVTX_H_
|
||||
#define NCCL_NVTX_H_
|
||||
|
||||
#include "nvtx3.hpp"
|
||||
|
||||
struct nccl_domain{static constexpr char const* name{"NCCL"};};
|
||||
|
||||
#endif
|
||||
Plik diff jest za duży
Load Diff
Plik diff jest za duży
Load Diff
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_CUDA_V3
|
||||
#define NVTOOLSEXT_CUDA_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* ========================================================================= */
|
||||
/** \name Functions for CUDA Resource Naming
|
||||
*/
|
||||
/** \addtogroup RESOURCE_NAMING
|
||||
* \section RESOURCE_NAMING_CUDA CUDA Resource Naming
|
||||
*
|
||||
* This section covers the API functions that allow annotating CUDA resources
|
||||
* with user-provided names.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_CUDA 4
|
||||
/** \endcond */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Resource types for CUDA
|
||||
*/
|
||||
typedef enum nvtxResourceCUDAType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
|
||||
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
|
||||
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
|
||||
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
|
||||
} nvtxResourceCUDAType_t;
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA device.
|
||||
*
|
||||
* Allows the user to associate a CUDA device with a user-provided name.
|
||||
*
|
||||
* \param device - The handle of the CUDA device to name.
|
||||
* \param name - The name of the CUDA device.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA context.
|
||||
*
|
||||
* Allows the user to associate a CUDA context with a user-provided name.
|
||||
*
|
||||
* \param context - The handle of the CUDA context to name.
|
||||
* \param name - The name of the CUDA context.
|
||||
*
|
||||
* \par Example:
|
||||
* \code
|
||||
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
|
||||
* if ( CUDA_SUCCESS != status )
|
||||
* goto Error;
|
||||
* nvtxNameCuContext(cuContext, "CTX_NAME");
|
||||
* \endcode
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA stream.
|
||||
*
|
||||
* Allows the user to associate a CUDA stream with a user-provided name.
|
||||
*
|
||||
* \param stream - The handle of the CUDA stream to name.
|
||||
* \param name - The name of the CUDA stream.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA event.
|
||||
*
|
||||
* Allows the user to associate a CUDA event with a user-provided name.
|
||||
*
|
||||
* \param event - The handle of the CUDA event to name.
|
||||
* \param name - The name of the CUDA event.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef UNICODE
|
||||
#define nvtxNameCuDevice nvtxNameCuDeviceW
|
||||
#define nvtxNameCuContext nvtxNameCuContextW
|
||||
#define nvtxNameCuStream nvtxNameCuStreamW
|
||||
#define nvtxNameCuEvent nvtxNameCuEventW
|
||||
#else
|
||||
#define nvtxNameCuDevice nvtxNameCuDeviceA
|
||||
#define nvtxNameCuContext nvtxNameCuContextA
|
||||
#define nvtxNameCuStream nvtxNameCuStreamA
|
||||
#define nvtxNameCuEvent nvtxNameCuEventA
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplCuda_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_CUDA
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_CUDA_V3 */
|
||||
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include "cuda.h"
|
||||
#include "driver_types.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_CUDART_V3
|
||||
#define NVTOOLSEXT_CUDART_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* ========================================================================= */
|
||||
/** \name Functions for CUDA Resource Naming
|
||||
*/
|
||||
/** \addtogroup RESOURCE_NAMING
|
||||
* \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
|
||||
*
|
||||
* This section covers the API functions that allow annotating CUDA resources
|
||||
* with user-provided names.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_CUDART 5
|
||||
/** \endcond */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Resource types for CUDART
|
||||
*/
|
||||
typedef enum nvtxResourceCUDARTType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
|
||||
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
|
||||
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
|
||||
} nvtxResourceCUDARTType_t;
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA device.
|
||||
*
|
||||
* Allows the user to associate a CUDA device with a user-provided name.
|
||||
*
|
||||
* \param device - The id of the CUDA device to name.
|
||||
* \param name - The name of the CUDA device.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA stream.
|
||||
*
|
||||
* Allows the user to associate a CUDA stream with a user-provided name.
|
||||
*
|
||||
* \param stream - The handle of the CUDA stream to name.
|
||||
* \param name - The name of the CUDA stream.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA event.
|
||||
*
|
||||
* Allows the user to associate a CUDA event with a user-provided name.
|
||||
*
|
||||
* \param event - The handle of the CUDA event to name.
|
||||
* \param name - The name of the CUDA event.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef UNICODE
|
||||
#define nvtxNameCudaDevice nvtxNameCudaDeviceW
|
||||
#define nvtxNameCudaStream nvtxNameCudaStreamW
|
||||
#define nvtxNameCudaEvent nvtxNameCudaEventW
|
||||
#else
|
||||
#define nvtxNameCudaDevice nvtxNameCudaDeviceA
|
||||
#define nvtxNameCudaStream nvtxNameCudaStreamA
|
||||
#define nvtxNameCudaEvent nvtxNameCudaEventA
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplCudaRt_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_CUDART
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_CUDART_V3 */
|
||||
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#ifndef NVTOOLSEXT_OPENCL_V3
|
||||
#define NVTOOLSEXT_OPENCL_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* ========================================================================= */
|
||||
/** \name Functions for OpenCL Resource Naming
|
||||
*/
|
||||
/** \addtogroup RESOURCE_NAMING
|
||||
* \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
|
||||
*
|
||||
* This section covers the API functions that allow annotating OpenCL resources
|
||||
* with user-provided names.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_OPENCL 6
|
||||
/** \endcond */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Resource types for OpenCL
|
||||
*/
|
||||
typedef enum nvtxResourceOpenCLType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
|
||||
} nvtxResourceOpenCLType_t;
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL device.
|
||||
*
|
||||
* Allows the user to associate an OpenCL device with a user-provided name.
|
||||
*
|
||||
* \param device - The handle of the OpenCL device to name.
|
||||
* \param name - The name of the OpenCL device.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL context.
|
||||
*
|
||||
* Allows the user to associate an OpenCL context with a user-provided name.
|
||||
*
|
||||
* \param context - The handle of the OpenCL context to name.
|
||||
* \param name - The name of the OpenCL context.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL command queue.
|
||||
*
|
||||
* Allows the user to associate an OpenCL command queue with a user-provided name.
|
||||
*
|
||||
* \param command_queue - The handle of the OpenCL command queue to name.
|
||||
* \param name - The name of the OpenCL command queue.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL memory object.
|
||||
*
|
||||
* Allows the user to associate an OpenCL memory object with a user-provided name.
|
||||
*
|
||||
* \param memobj - The handle of the OpenCL memory object to name.
|
||||
* \param name - The name of the OpenCL memory object.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL sampler.
|
||||
*
|
||||
* Allows the user to associate an OpenCL sampler with a user-provided name.
|
||||
*
|
||||
* \param sampler - The handle of the OpenCL sampler to name.
|
||||
* \param name - The name of the OpenCL sampler.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL program.
|
||||
*
|
||||
* Allows the user to associate an OpenCL program with a user-provided name.
|
||||
*
|
||||
* \param program - The handle of the OpenCL program to name.
|
||||
* \param name - The name of the OpenCL program.
|
||||
*
|
||||
* \code
|
||||
* cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
|
||||
* (const char **) &cSourceCL, &program_length, &ciErrNum);
|
||||
* shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
* nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
|
||||
* \endcode
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL event.
|
||||
*
|
||||
* Allows the user to associate an OpenCL event with a user-provided name.
|
||||
*
|
||||
* \param evnt - The handle of the OpenCL event to name.
|
||||
* \param name - The name of the OpenCL event.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef UNICODE
|
||||
#define nvtxNameClDevice nvtxNameClDeviceW
|
||||
#define nvtxNameClContext nvtxNameClContextW
|
||||
#define nvtxNameClCommandQueue nvtxNameClCommandQueueW
|
||||
#define nvtxNameClMemObject nvtxNameClMemObjectW
|
||||
#define nvtxNameClSampler nvtxNameClSamplerW
|
||||
#define nvtxNameClProgram nvtxNameClProgramW
|
||||
#define nvtxNameClEvent nvtxNameClEventW
|
||||
#else
|
||||
#define nvtxNameClDevice nvtxNameClDeviceA
|
||||
#define nvtxNameClContext nvtxNameClContextA
|
||||
#define nvtxNameClCommandQueue nvtxNameClCommandQueueA
|
||||
#define nvtxNameClMemObject nvtxNameClMemObjectA
|
||||
#define nvtxNameClSampler nvtxNameClSamplerA
|
||||
#define nvtxNameClProgram nvtxNameClProgramA
|
||||
#define nvtxNameClEvent nvtxNameClEventA
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_OPENCL
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_OPENCL_V3 */
|
||||
@@ -0,0 +1,382 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_SYNC_V3
|
||||
#define NVTOOLSEXT_SYNC_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
|
||||
/** \endcond */
|
||||
|
||||
|
||||
/**
|
||||
* \page PAGE_SYNCHRONIZATION Synchronization
|
||||
*
|
||||
* This section covers a subset of the API that allows users to track additional
|
||||
* synchronization details of their application. Naming OS synchronization primitives
|
||||
* may allow users to better understand the data collected by traced synchronization
|
||||
* APIs. Additionally, a user defined synchronization object can allow the users to
|
||||
* tell the tools when the user is building their own synchronization system
|
||||
* that does not rely on the OS to provide behaviors and instead uses techniques like
|
||||
* atomic operations and spinlocks.
|
||||
*
|
||||
* See module \ref SYNCHRONIZATION for details.
|
||||
*
|
||||
* \par Example:
|
||||
* \code
|
||||
* class MyMutex
|
||||
* {
|
||||
* volatile long bLocked;
|
||||
* nvtxSyncUser_t hSync;
|
||||
* public:
|
||||
* MyMutex(const char* name, nvtxDomainHandle_t d){
|
||||
* bLocked = 0;
|
||||
*
|
||||
* nvtxSyncUserAttributes_t attribs = { 0 };
|
||||
* attribs.version = NVTX_VERSION;
|
||||
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
|
||||
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
|
||||
* attribs.message.ascii = name;
|
||||
* hSync = nvtxDomainSyncUserCreate(d, &attribs);
|
||||
* }
|
||||
*
|
||||
* ~MyMutex() {
|
||||
* nvtxDomainSyncUserDestroy(hSync);
|
||||
* }
|
||||
*
|
||||
* bool Lock() {
|
||||
* nvtxDomainSyncUserAcquireStart(hSync);
|
||||
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
|
||||
|
||||
* if (acquired) {
|
||||
* nvtxDomainSyncUserAcquireSuccess(hSync);
|
||||
* }
|
||||
* else {
|
||||
* nvtxDomainSyncUserAcquireFailed(hSync);
|
||||
* }
|
||||
* return acquired;
|
||||
* }
|
||||
|
||||
* void Unlock() {
|
||||
* nvtxDomainSyncUserReleasing(hSync);
|
||||
* bLocked = false;
|
||||
* }
|
||||
* };
|
||||
* \endcode
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build non-colliding values for resource types, separated by class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
|
||||
#define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
|
||||
/** \endcond */
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \defgroup SYNCHRONIZATION Synchronization
|
||||
* See page \ref PAGE_SYNCHRONIZATION.
|
||||
* @{
|
||||
*/
|
||||
|
||||
/** \brief Resource type values for OSs with POSIX Thread API support
|
||||
*
* Each enumerator is produced by NVTX_RESOURCE_MAKE_TYPE with the SYNC_PTHREAD
* class and an index from 1 to 6. The trailing comment on each enumerator names
* the pthread type that the resource type describes.
* NOTE(review): NVTX_RESOURCE_MAKE_TYPE is defined outside this header; it
* presumably combines NVTX_RESOURCE_CLASS_SYNC_PTHREAD with the index -- confirm.
*/
|
||||
typedef enum nvtxResourceSyncPosixThreadType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
|
||||
} nvtxResourceSyncPosixThreadType_t;
|
||||
|
||||
/** \brief Resource type values for Windows OSs
|
||||
*
* Each enumerator is produced by NVTX_RESOURCE_MAKE_TYPE with the SYNC_OS
* class and an index from 1 to 5. ::nvtxResourceSyncLinuxType_t is built from
* the same class and overlapping indices.
* NOTE(review): the two OS-specific enums therefore presumably share numeric
* values and are only distinguishable by the platform -- confirm against
* the definition of NVTX_RESOURCE_MAKE_TYPE.
*/
|
||||
typedef enum nvtxResourceSyncWindowsType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
|
||||
} nvtxResourceSyncWindowsType_t;
|
||||
|
||||
/** \brief Resource type values for Linux and Linux derived OSs such as Android
|
||||
*
* Each enumerator is produced by NVTX_RESOURCE_MAKE_TYPE with the SYNC_OS
* class and an index from 1 to 7. ::nvtxResourceSyncWindowsType_t is built
* from the same class and overlapping indices.
* NOTE(review): the two OS-specific enums therefore presumably share numeric
* values and are only distinguishable by the platform -- confirm against
* the definition of NVTX_RESOURCE_MAKE_TYPE.
*
* \sa
|
||||
* ::nvtxResourceSyncPosixThreadType_t
|
||||
*/
|
||||
typedef enum nvtxResourceSyncLinuxType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
|
||||
} nvtxResourceSyncLinuxType_t;
|
||||
|
||||
/** \brief Resource type values for Android come from Linux.
|
||||
* \sa
|
||||
* ::nvtxResourceSyncLinuxType_t
|
||||
* ::nvtxResourceSyncPosixThreadType_t
|
||||
*/
|
||||
typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
|
||||
|
||||
/** \brief User Defined Synchronization Object Handle.
|
||||
* \anchor SYNCUSER_HANDLE_STRUCTURE
|
||||
*
|
||||
* This structure is opaque to the user and is used as a handle to reference
|
||||
* a user defined synchronization object. The tools will return a pointer through the API for the application
|
||||
* to hold on its behalf to reference the object in the future.
|
||||
*
|
||||
*/
|
||||
typedef struct nvtxSyncUser* nvtxSyncUser_t;
|
||||
|
||||
/** \brief User Defined Synchronization Object Attributes Structure.
|
||||
* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
|
||||
*
|
||||
* This structure is used to describe the attributes of a user defined synchronization
|
||||
* object. The layout of the structure is defined by a specific version of the tools
|
||||
* extension library and can change between different versions of the Tools Extension
|
||||
* library.
|
||||
*
|
||||
* \par Initializing the Attributes
|
||||
*
|
||||
* The caller should always perform the following three tasks when using
|
||||
* attributes:
|
||||
* <ul>
|
||||
* <li>Zero the structure
|
||||
* <li>Set the version field
|
||||
* <li>Set the size field
|
||||
* </ul>
|
||||
*
|
||||
* Zeroing the structure sets all the event attributes types and values
|
||||
* to the default value.
|
||||
*
|
||||
* The version and size field are used by the Tools Extension
|
||||
* implementation to handle multiple versions of the attributes structure.
|
||||
*
|
||||
* It is recommended that the caller use one of the following two methods
|
||||
* to initialize the event attributes structure:
|
||||
*
|
||||
* \par Method 1: Initializing nvtxSyncUserAttributes_t for future compatibility
|
||||
* \code
|
||||
* nvtxSyncUserAttributes_t attribs = {0};
|
||||
* attribs.version = NVTX_VERSION;
|
||||
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
|
||||
* \endcode
|
||||
*
|
||||
* \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
|
||||
* \code
|
||||
* nvtxSyncUserAttributes_t attribs = {0};
|
||||
* attribs.version = 1;
|
||||
* attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
|
||||
* \endcode
|
||||
*
|
||||
* If the caller uses Method 1 it is critical that the entire binary
|
||||
* layout of the structure be configured to 0 so that all fields
|
||||
* are initialized to the default value.
|
||||
*
|
||||
* The caller should either use both NVTX_VERSION and
|
||||
* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
|
||||
* and a versioned type (Method 2). Using a mix of the two methods
|
||||
* will likely cause either source level incompatibility or binary
|
||||
* incompatibility in the future.
|
||||
*
|
||||
* \par Settings Attribute Types and Values
|
||||
*
|
||||
*
|
||||
* \par Example:
|
||||
* \code
|
||||
* // Initialize
|
||||
* nvtxSyncUserAttributes_t attribs = {0};
|
||||
* attribs.version = NVTX_VERSION;
|
||||
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
|
||||
*
|
||||
* // Configure the Attributes
|
||||
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
|
||||
* attribs.message.ascii = "Example";
|
||||
* \endcode
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
*/
|
||||
typedef struct nvtxSyncUserAttributes_v0
|
||||
{
|
||||
/**
|
||||
* \brief Version flag of the structure.
|
||||
*
|
||||
* Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
|
||||
* supported in this header file. This can optionally be overridden to
|
||||
* another version of the tools extension library.
|
||||
*/
|
||||
uint16_t version;
|
||||
|
||||
/**
|
||||
* \brief Size of the structure.
|
||||
*
|
||||
* Needs to be set to the size in bytes of this attribute
|
||||
* structure (see NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE).
|
||||
*/
|
||||
uint16_t size;
|
||||
|
||||
/** \brief Message type specified in this attribute structure.
|
||||
*
|
||||
* Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
|
||||
* "message" field.
|
||||
*
|
||||
* Default Value is NVTX_MESSAGE_UNKNOWN
|
||||
*/
|
||||
int32_t messageType; /* nvtxMessageType_t */
|
||||
|
||||
/** \brief Message assigned to this attribute structure.
|
||||
*
|
||||
* The text message that is attached to an event.
|
||||
*/
|
||||
nvtxMessageValue_t message;
|
||||
|
||||
} nvtxSyncUserAttributes_v0;
|
||||
|
||||
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Create a user defined synchronization object
|
||||
* This is used to track non-OS synchronization working with spinlocks and atomics
|
||||
*
|
||||
* \param domain - Domain to own the resource
|
||||
* \param attribs - A structure to assign multiple attributes to the object.
|
||||
*
|
||||
* \return A handle that represents the newly created user defined synchronization object.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Destroy a user defined synchronization object
|
||||
* This is used to track non-OS synchronization working with spinlocks and atomics
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signal to tools that an attempt to acquire a user defined synchronization object is starting
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signal to tools of failure in acquiring a user defined synchronization object
|
||||
* This should be called after \ref nvtxDomainSyncUserAcquireStart
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signal to tools of success in acquiring a user defined synchronization object
|
||||
* This should be called after \ref nvtxDomainSyncUserAcquireStart.
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signal to tools of releasing a reservation on user defined synchronization object
|
||||
* This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
|
||||
|
||||
|
||||
/** @} */ /*END defgroup*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplSync_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_SYNC
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_SYNC_V3 */
|
||||
@@ -0,0 +1,438 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
/* ---- Include required platform headers ---- */
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
#include <Windows.h>
|
||||
|
||||
#else
|
||||
#include <unistd.h>
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
#include <android/api-level.h>
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__CYGWIN__)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
#include <dlfcn.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <pthread.h>
|
||||
#include <stdlib.h>
|
||||
#include <wchar.h>
|
||||
|
||||
#endif
|
||||
|
||||
/* ---- Define macros used in this file ---- */
|
||||
|
||||
#define NVTX_INIT_STATE_FRESH 0
|
||||
#define NVTX_INIT_STATE_STARTED 1
|
||||
#define NVTX_INIT_STATE_COMPLETE 2
|
||||
|
||||
/*
 * Diagnostic logging helpers used by the NVTX implementation.
 *
 * NVTX_ERR(...) and NVTX_INFO(...) take printf-style arguments whose first
 * argument is a string literal format.
 *   - NVTX_DEBUG_PRINT defined, __ANDROID__ defined: forwarded to
 *     __android_log_print with the "NVTOOLSEXT" tag.
 *   - NVTX_DEBUG_PRINT defined, otherwise: written to stderr with an
 *     "NVTX_ERROR: " / "NVTX_INFO: " prefix (the prefix is concatenated with
 *     the caller's format literal).
 *   - NVTX_DEBUG_PRINT not defined: both macros expand to nothing.
 *
 * None of the variants carries a trailing semicolon; the caller supplies it.
 * This keeps every variant a single expression statement, so a call used as
 * the body of an unbraced if/else behaves the same on all platforms.
 */
#ifdef NVTX_DEBUG_PRINT
#ifdef __ANDROID__
#include <android/log.h>
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__)
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__)
#else
#include <stdio.h>
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
#endif
#else /* !defined(NVTX_DEBUG_PRINT) */
#define NVTX_ERR(...)
#define NVTX_INFO(...)
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push(hidden)
|
||||
#endif
|
||||
|
||||
/* ---- Forward declare all functions referenced in globals ---- */
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
|
||||
NvtxCallbackModule module,
|
||||
NvtxFunctionTable* out_table,
|
||||
unsigned int* out_size);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
|
||||
uint32_t version);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
|
||||
uint32_t exportTableId);
|
||||
|
||||
#include "nvtxInitDecls.h"
|
||||
|
||||
/* ---- Define all globals ---- */
|
||||
|
||||
typedef struct nvtxGlobals_t
|
||||
{
|
||||
volatile unsigned int initState;
|
||||
NvtxExportTableCallbacks etblCallbacks;
|
||||
NvtxExportTableVersionInfo etblVersionInfo;
|
||||
|
||||
/* Implementation function pointers */
|
||||
nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
|
||||
nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
|
||||
nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
|
||||
nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
|
||||
nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
|
||||
nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
|
||||
nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
|
||||
nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
|
||||
nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
|
||||
nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
|
||||
nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
|
||||
nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
|
||||
nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
|
||||
nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
|
||||
nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
|
||||
|
||||
nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
|
||||
nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
|
||||
nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
|
||||
nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
|
||||
nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
|
||||
nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
|
||||
nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
|
||||
nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
|
||||
|
||||
nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
|
||||
nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
|
||||
nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
|
||||
nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
|
||||
nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
|
||||
nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
|
||||
nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
|
||||
nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
|
||||
nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
|
||||
nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
|
||||
nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
|
||||
nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
|
||||
nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
|
||||
nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
|
||||
|
||||
nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
|
||||
nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
|
||||
nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
|
||||
nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
|
||||
nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
|
||||
nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
|
||||
|
||||
nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
|
||||
nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
|
||||
nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
|
||||
nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
|
||||
nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
|
||||
nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
|
||||
nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
|
||||
nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
|
||||
nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
|
||||
nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
|
||||
nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
|
||||
nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
|
||||
nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
|
||||
nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
|
||||
nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
|
||||
|
||||
nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
|
||||
nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
|
||||
nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
|
||||
nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
|
||||
nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
|
||||
nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
|
||||
|
||||
/* Tables of function pointers -- Extra null added to the end to ensure
|
||||
* a crash instead of silent corruption if a tool reads off the end. */
|
||||
NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
|
||||
} nvtxGlobals_t;
|
||||
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
|
||||
{
|
||||
NVTX_INIT_STATE_FRESH,
|
||||
|
||||
{
|
||||
sizeof(NvtxExportTableCallbacks),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
|
||||
},
|
||||
{
|
||||
sizeof(NvtxExportTableVersionInfo),
|
||||
NVTX_VERSION,
|
||||
0,
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
|
||||
},
|
||||
|
||||
/* Implementation function pointers */
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
|
||||
|
||||
/* Tables of function pointers */
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
|
||||
0
|
||||
}
|
||||
};
|
||||
|
||||
/* ---- Define static inline implementations of core API functions ---- */
|
||||
|
||||
#include "nvtxImplCore.h"
|
||||
|
||||
/* ---- Define implementations of export table functions ---- */
|
||||
|
||||
/* Looks up the function table that belongs to `module` inside the versioned
 * nvtxGlobals.
 *
 * On success returns 1 and, for each non-null output pointer, stores:
 *   - *out_table: the selected table
 *   - *out_size:  the number of pointer-sized slots in that table, minus one
 * Returns 0 (and writes nothing) when `module` is not one of the known ids. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
    NvtxCallbackModule module,
    NvtxFunctionTable* out_table,
    unsigned int* out_size)
{
    NvtxFunctionTable selected = (NvtxFunctionTable)0;
    unsigned int tableBytes = 0;

    switch (module)
    {
        case NVTX_CB_MODULE_CORE:
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
            break;

        case NVTX_CB_MODULE_CUDA:
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
            break;

        case NVTX_CB_MODULE_OPENCL:
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
            break;

        case NVTX_CB_MODULE_CUDART:
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
            break;

        case NVTX_CB_MODULE_CORE2:
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
            break;

        case NVTX_CB_MODULE_SYNC:
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
            break;

        default:
            /* Unknown module id: report failure without touching the outputs. */
            return 0;
    }

    if (out_table)
    {
        *out_table = selected;
    }

    if (out_size)
    {
        unsigned int slotCount = tableBytes / (unsigned int)sizeof(NvtxFunctionPointer*);
        *out_size = slotCount - 1;
    }

    return 1;
}
|
||||
|
||||
/* Returns the address of the export table identified by `exportTableId`
 * inside the versioned nvtxGlobals, or 0 when the id is not recognized. */
NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
{
    if (exportTableId == NVTX_ETID_CALLBACKS)
    {
        return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
    }

    if (exportTableId == NVTX_ETID_VERSIONINFO)
    {
        return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
    }

    return 0;
}
|
||||
|
||||
/* Intentionally a no-op: the hook is reserved for custom implementations to
 * resolve problems with tools. The argument is only consumed to avoid an
 * unused-parameter warning. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
{
    (void)version;
}
|
||||
|
||||
/* ---- Define implementations of init versions of all API functions ---- */
|
||||
|
||||
#include "nvtxInitDefs.h"
|
||||
|
||||
/* ---- Define implementations of initialization functions ---- */
|
||||
|
||||
#include "nvtxInit.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,307 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxMarkEx_impl_fnptr when
 * that slot is non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxMarkEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr;
    if (impl)
    {
        impl(eventAttrib);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxMarkA_impl_fnptr when
 * that slot is non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxMarkA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr;
    if (impl)
    {
        impl(message);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxMarkW_impl_fnptr when
 * that slot is non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxMarkW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr;
    if (impl)
    {
        impl(message);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxRangeStartEx_impl_fnptr and
 * returns its result. Returns (nvtxRangeId_t)0 when the slot is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxRangeStartEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr;
    if (impl)
    {
        return impl(eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxRangeStartA_impl_fnptr and
 * returns its result. Returns (nvtxRangeId_t)0 when the slot is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxRangeStartA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxRangeStartW_impl_fnptr and
 * returns its result. Returns (nvtxRangeId_t)0 when the slot is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxRangeStartW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxRangeEnd_impl_fnptr when
 * that slot is non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
    nvtxRangeEnd_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr;
    if (impl)
    {
        impl(id);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxRangePushEx_impl_fnptr and
 * returns its result. Returns NVTX_NO_PUSH_POP_TRACKING when the slot is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxRangePushEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr;
    if (impl)
    {
        return impl(eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxRangePushA_impl_fnptr and
 * returns its result. Returns NVTX_NO_PUSH_POP_TRACKING when the slot is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxRangePushA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxRangePushW_impl_fnptr and
 * returns its result. Returns NVTX_NO_PUSH_POP_TRACKING when the slot is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxRangePushW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxRangePop_impl_fnptr and returns
 * its result. Returns NVTX_NO_PUSH_POP_TRACKING when the slot is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePop(void)
{
#ifndef NVTX_DISABLE
    nvtxRangePop_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr;
    if (impl)
    {
        return impl();
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxNameCategoryA_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCategoryA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr;
    if (impl)
    {
        impl(category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxNameCategoryW_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCategoryW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr;
    if (impl)
    {
        impl(category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxNameOsThreadA_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameOsThreadA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr;
    if (impl)
    {
        impl(threadId, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxNameOsThreadW_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameOsThreadW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr;
    if (impl)
    {
        impl(threadId, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxDomainMarkEx_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxDomainMarkEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr;
    if (impl)
    {
        impl(domain, eventAttrib);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainRangeStartEx_impl_fnptr
 * and returns its result. Returns (nvtxRangeId_t)0 when the slot is null or
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangeStartEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr;
    if (impl)
    {
        return impl(domain, eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxDomainRangeEnd_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangeEnd_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr;
    if (impl)
    {
        impl(domain, id);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainRangePushEx_impl_fnptr
 * and returns its result. Returns NVTX_NO_PUSH_POP_TRACKING when the slot is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangePushEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr;
    if (impl)
    {
        return impl(domain, eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainRangePop_impl_fnptr and
 * returns its result. Returns NVTX_NO_PUSH_POP_TRACKING when the slot is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangePop_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr;
    if (impl)
    {
        return impl(domain);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainResourceCreate_impl_fnptr
 * and returns its result. Returns (nvtxResourceHandle_t)0 when the slot is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    nvtxDomainResourceCreate_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr;
    if (impl)
    {
        return impl(domain, attribs);
    }
#endif /* NVTX_DISABLE */
    return (nvtxResourceHandle_t)0;
}
|
||||
|
||||
/* Forwards to the handler stored in
 * nvtxGlobals.nvtxDomainResourceDestroy_impl_fnptr when that slot is
 * non-null; otherwise does nothing. The body is compiled out when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource)
{
#ifndef NVTX_DISABLE
    nvtxDomainResourceDestroy_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr;
    if (impl)
    {
        impl(resource);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in
 * nvtxGlobals.nvtxDomainNameCategoryA_impl_fnptr when that slot is non-null;
 * otherwise does nothing. The body is compiled out when NVTX_DISABLE is
 * defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxDomainNameCategoryA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr;
    if (impl)
    {
        impl(domain, category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in
 * nvtxGlobals.nvtxDomainNameCategoryW_impl_fnptr when that slot is non-null;
 * otherwise does nothing. The body is compiled out when NVTX_DISABLE is
 * defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxDomainNameCategoryW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr;
    if (impl)
    {
        impl(domain, category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainRegisterStringA_impl_fnptr
 * and returns its result. Returns (nvtxStringHandle_t)0 when the slot is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
{
#ifndef NVTX_DISABLE
    nvtxDomainRegisterStringA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr;
    if (impl)
    {
        return impl(domain, string);
    }
#endif /* NVTX_DISABLE */
    return (nvtxStringHandle_t)0;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainRegisterStringW_impl_fnptr
 * and returns its result. Returns (nvtxStringHandle_t)0 when the slot is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string)
{
#ifndef NVTX_DISABLE
    nvtxDomainRegisterStringW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr;
    if (impl)
    {
        return impl(domain, string);
    }
#endif /* NVTX_DISABLE */
    return (nvtxStringHandle_t)0;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainCreateA_impl_fnptr and
 * returns its result. Returns (nvtxDomainHandle_t)0 when the slot is null or
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxDomainCreateA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxDomainHandle_t)0;
}
|
||||
|
||||
/* Calls the handler stored in nvtxGlobals.nvtxDomainCreateW_impl_fnptr and
 * returns its result. Returns (nvtxDomainHandle_t)0 when the slot is null or
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxDomainCreateW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxDomainHandle_t)0;
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxDomainDestroy_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
    nvtxDomainDestroy_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr;
    if (impl)
    {
        impl(domain);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards to the handler stored in nvtxGlobals.nvtxInitialize_impl_fnptr
 * when that slot is non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved)
{
#ifndef NVTX_DISABLE
    nvtxInitialize_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr;
    if (impl)
    {
        impl(reserved);
    }
#endif /* NVTX_DISABLE */
}
|
||||
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_CUDART
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Typed signatures for the CUDA runtime naming handlers. The wrappers in this
 * file cast the matching nvtxGlobals slots to these types before calling. */
typedef void (NVTX_API *nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API *nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API *nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
typedef void (NVTX_API *nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API *nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
typedef void (NVTX_API *nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCudaDeviceA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaDeviceA_impl_fntype impl =
        (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCudaDeviceW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaDeviceW_impl_fntype impl =
        (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCudaStreamA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaStreamA_impl_fntype impl =
        (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCudaStreamW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaStreamW_impl_fntype impl =
        (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCudaEventA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaEventA_impl_fntype impl =
        (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCudaEventW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaEventW_impl_fntype impl =
        (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_CUDA
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Typed signatures for the CUDA driver naming handlers. The wrappers in this
 * file cast the matching nvtxGlobals slots to these types before calling. */
typedef void (NVTX_API *nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API *nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API *nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API *nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API *nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API *nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API *nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API *nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuDeviceA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuDeviceA_impl_fntype impl =
        (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuDeviceW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuDeviceW_impl_fntype impl =
        (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuContextA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuContextA_impl_fntype impl =
        (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuContextW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuContextW_impl_fntype impl =
        (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuStreamA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuStreamA_impl_fntype impl =
        (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuStreamW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuStreamW_impl_fntype impl =
        (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuEventA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuEventA_impl_fntype impl =
        (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameCuEventW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuEventW_impl_fntype impl =
        (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_OPENCL
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Typed signatures for the OpenCL naming handlers. The wrappers in this file
 * cast the matching nvtxGlobals slots to these types before calling. */
typedef void (NVTX_API *nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name);
typedef void (NVTX_API *nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name);
typedef void (NVTX_API *nvtxNameClContextA_impl_fntype)(cl_context context, const char* name);
typedef void (NVTX_API *nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name);
typedef void (NVTX_API *nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name);
typedef void (NVTX_API *nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name);
typedef void (NVTX_API *nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name);
typedef void (NVTX_API *nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name);
typedef void (NVTX_API *nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name);
typedef void (NVTX_API *nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name);
typedef void (NVTX_API *nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name);
typedef void (NVTX_API *nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name);
typedef void (NVTX_API *nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name);
typedef void (NVTX_API *nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name);
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClDeviceA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClDeviceA_impl_fntype impl =
        (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClDeviceW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClDeviceW_impl_fntype impl =
        (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClContextA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClContextA_impl_fntype impl =
        (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClContextW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClContextW_impl_fntype impl =
        (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClCommandQueueA_impl_fnptr to its typed signature
 * and calls it when non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClCommandQueueA_impl_fntype impl =
        (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
    if (impl)
    {
        impl(command_queue, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClCommandQueueW_impl_fnptr to its typed signature
 * and calls it when non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClCommandQueueW_impl_fntype impl =
        (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
    if (impl)
    {
        impl(command_queue, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClMemObjectA_impl_fnptr to its typed signature
 * and calls it when non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClMemObjectA_impl_fntype impl =
        (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
    if (impl)
    {
        impl(memobj, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClMemObjectW_impl_fnptr to its typed signature
 * and calls it when non-null; otherwise does nothing. The body is compiled
 * out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClMemObjectW_impl_fntype impl =
        (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
    if (impl)
    {
        impl(memobj, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClSamplerA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClSamplerA_impl_fntype impl =
        (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
    if (impl)
    {
        impl(sampler, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClSamplerW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClSamplerW_impl_fntype impl =
        (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
    if (impl)
    {
        impl(sampler, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClProgramA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClProgramA_impl_fntype impl =
        (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
    if (impl)
    {
        impl(program, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClProgramW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClProgramW_impl_fntype impl =
        (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
    if (impl)
    {
        impl(program, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClEventA_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClEventA_impl_fntype impl =
        (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
    if (impl)
    {
        impl(evnt, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxNameClEventW_impl_fnptr to its typed signature and
 * calls it when non-null; otherwise does nothing. The body is compiled out
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClEventW_impl_fntype impl =
        (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
    if (impl)
    {
        impl(evnt, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_SYNC
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
|
||||
|
||||
/* Casts nvtxGlobals.nvtxDomainSyncUserCreate_impl_fnptr to its typed
 * signature, calls it when non-null and returns its result. Returns
 * (nvtxSyncUser_t)0 when the slot is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserCreate_impl_fntype impl =
        (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    if (impl)
    {
        return impl(domain, attribs);
    }
#endif /* NVTX_DISABLE */
    return (nvtxSyncUser_t)0;
}
|
||||
|
||||
/* Casts nvtxGlobals.nvtxDomainSyncUserDestroy_impl_fnptr to its typed
 * signature and calls it when non-null; otherwise does nothing. The body is
 * compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserDestroy_impl_fntype impl =
        (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if (impl)
    {
        impl(handle);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Public entry point: forwards to the nvtxDomainSyncUserAcquireStart
 * implementation stored in the nvtxGlobals function table.  Does nothing
 * when the table entry is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireStart_impl_fntype impl =
        (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;

    if (!impl)
    {
        return;
    }
    impl(handle);
#endif /*NVTX_DISABLE*/
}
|
||||
|
||||
/* Public entry point: forwards to the nvtxDomainSyncUserAcquireFailed
 * implementation stored in the nvtxGlobals function table.  Does nothing
 * when the table entry is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireFailed_impl_fntype impl =
        (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;

    if (!impl)
    {
        return;
    }
    impl(handle);
#endif /*NVTX_DISABLE*/
}
|
||||
|
||||
/* Public entry point: forwards to the nvtxDomainSyncUserAcquireSuccess
 * implementation stored in the nvtxGlobals function table.  Does nothing
 * when the table entry is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireSuccess_impl_fntype impl =
        (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;

    if (!impl)
    {
        return;
    }
    impl(handle);
#endif /*NVTX_DISABLE*/
}
|
||||
|
||||
/* Public entry point: forwards to the nvtxDomainSyncUserReleasing
 * implementation stored in the nvtxGlobals function table.  Does nothing
 * when the table entry is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserReleasing_impl_fntype impl =
        (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;

    if (!impl)
    {
        return;
    }
    impl(handle);
#endif /*NVTX_DISABLE*/
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,312 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
/* ---- Platform-independent helper definitions and functions ---- */
|
||||
|
||||
/* Prefer macros over inline functions to reduce symbol resolution at link time */
|
||||
|
||||
/* Platform abstraction used by the NVTX initialization code:
 *   NVTX_PATHCHAR / NVTX_STR / NVTX_GETENV / NVTX_BUFSIZE  - path strings and env lookup
 *   NVTX_DLLHANDLE / NVTX_DLLOPEN / NVTX_DLLFUNC / NVTX_DLLCLOSE - dynamic library access
 *   NVTX_YIELD / NVTX_MEMBAR                                - thread yield and full barrier
 *   NVTX_ATOMIC_WRITE_32(address, value)                    - atomic 32-bit store
 *   NVTX_ATOMIC_CAS_32(old, address, exchange, comparand)   - atomic compare-and-swap:
 *       if *address == comparand, store exchange into *address; in all cases
 *       assign the value *address held before the operation to 'old'.
 */
#if defined(_WIN32)
#define NVTX_PATHCHAR   wchar_t
#define NVTX_STR(x)     L##x
#define NVTX_GETENV     _wgetenv
#define NVTX_BUFSIZE    MAX_PATH
#define NVTX_DLLHANDLE  HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC    GetProcAddress
#define NVTX_DLLCLOSE   FreeLibrary
#define NVTX_YIELD()    SwitchToThread()
#define NVTX_MEMBAR()   MemoryBarrier()
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
#elif defined(__GNUC__)
#define NVTX_PATHCHAR   char
#define NVTX_STR(x)     x
#define NVTX_GETENV     getenv
#define NVTX_BUFSIZE    PATH_MAX
#define NVTX_DLLHANDLE  void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC    dlsym
#define NVTX_DLLCLOSE   dlclose
#define NVTX_YIELD()    sched_yield()
#define NVTX_MEMBAR()   __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions.
 * Both macros expand to a single comma expression so they remain one
 * statement when used as the body of an unbraced if/else/loop. */
#define NVTX_ATOMIC_WRITE_32(address, value) \
    (__sync_synchronize(), __sync_lock_test_and_set((address), (value)))
/* __sync_val_compare_and_swap takes (ptr, oldval, newval): the comparand must
 * come BEFORE the exchange value, which is the opposite of the order used by
 * InterlockedCompareExchange and by this macro's own parameter list. */
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) \
    (__sync_synchronize(), (old) = __sync_val_compare_and_swap((address), (comparand), (exchange)))
#else
#error The library does not support your configuration!
#endif
|
||||
|
||||
/* Set to 1 on platforms where an already-injected (pre-loaded) library can be
 * discovered.  Currently 0 everywhere; the Windows case is still a TODO. */
#ifdef _WIN32
/* TODO */
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#else
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#endif

/* Set to 1 on platforms that support environment variables.
 * TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0.
 * Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
#define NVTX_SUPPORT_ENV_VARS 1

/* Set to 1 on platforms that support dynamic/shared libraries. */
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
|
||||
|
||||
/* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked,
|
||||
* and this will override any dynamic injection. Useful for platforms where dynamic
|
||||
* injection is not available. Since weak symbols not explicitly marked extern are
|
||||
* guaranteed to be initialized to zero if no definitions are found by the linker, the
|
||||
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
|
||||
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
|
||||
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
|
||||
/* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal
|
||||
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which
|
||||
* does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic
|
||||
* injection library. */
|
||||
__attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr;
|
||||
#else
|
||||
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
|
||||
#endif
|
||||
|
||||
/* This function tries to find or load an NVTX injection library and get the
|
||||
* address of its InitializeInjection2 function. If such a function pointer
|
||||
* is found, it is called, and passed the address of this NVTX instance's
|
||||
* nvtxGetExportTable function, so the injection can attach to this instance.
|
||||
* If the initialization fails for any reason, any dynamic library loaded will
|
||||
* be freed, and all NVTX implementation functions will be set to no-ops. If
|
||||
* initialization succeeds, NVTX functions not attached to the tool will be set
|
||||
* to no-ops. This is implemented as one function instead of several small
|
||||
* functions to minimize the number of weak symbols the linker must resolve.
|
||||
* Order of search is:
|
||||
* - Pre-injected library exporting InitializeInjectionNvtx2
|
||||
* - Loadable library exporting InitializeInjectionNvtx2
|
||||
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
|
||||
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
|
||||
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
|
||||
*/
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void)
|
||||
{
|
||||
const char* const initFuncName = "InitializeInjectionNvtx2";
|
||||
NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0;
|
||||
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
|
||||
int entryPointStatus = 0;
|
||||
|
||||
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
|
||||
/* Use POSIX global symbol chain to query for init function from any module */
|
||||
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName);
|
||||
#endif
|
||||
|
||||
#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
|
||||
/* Try discovering dynamic injection library to load */
|
||||
if (!init_fnptr)
|
||||
{
|
||||
#if NVTX_SUPPORT_ENV_VARS
|
||||
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
|
||||
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
|
||||
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
|
||||
? NVTX_STR("NVTX_INJECTION32_PATH")
|
||||
: NVTX_STR("NVTX_INJECTION64_PATH");
|
||||
#endif /* NVTX_SUPPORT_ENV_VARS */
|
||||
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
|
||||
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
|
||||
|
||||
/* Refer to this variable explicitly in case all references to it are #if'ed out */
|
||||
(void)injectionLibraryPathBuf;
|
||||
|
||||
#if NVTX_SUPPORT_ENV_VARS
|
||||
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
|
||||
* these functions are not called again before using the returned value. */
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning( push )
|
||||
#pragma warning( disable : 4996 )
|
||||
#endif
|
||||
injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning( pop )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
if (!injectionLibraryPath)
|
||||
{
|
||||
const char *bits = (sizeof(void*) == 4) ? "32" : "64";
|
||||
char cmdlineBuf[32];
|
||||
char pkgName[PATH_MAX];
|
||||
int count;
|
||||
int pid;
|
||||
FILE *fp;
|
||||
size_t bytesRead;
|
||||
size_t pos;
|
||||
|
||||
pid = (int)getpid();
|
||||
count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
|
||||
if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
|
||||
{
|
||||
NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
fp = fopen(cmdlineBuf, "r");
|
||||
if (!fp)
|
||||
{
|
||||
NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
|
||||
fclose(fp);
|
||||
if (bytesRead == 0)
|
||||
{
|
||||
NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
pkgName[bytesRead] = 0;
|
||||
|
||||
/* String can contain colon as a process separator. In this case the package name is before the colon. */
|
||||
pos = 0;
|
||||
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
|
||||
{
|
||||
++pos;
|
||||
}
|
||||
pkgName[pos] = 0;
|
||||
|
||||
count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
|
||||
if (count <= 0 || count >= NVTX_BUFSIZE)
|
||||
{
|
||||
NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
/* On Android, verify path is accessible due to aggressive file access restrictions. */
|
||||
/* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
|
||||
/* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
|
||||
if (injectionLibraryPathBuf[0] == '/')
|
||||
{
|
||||
#if (__ANDROID_API__ < 21)
|
||||
int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
|
||||
#else
|
||||
int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
|
||||
#endif
|
||||
if (access_err != 0)
|
||||
{
|
||||
NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
}
|
||||
injectionLibraryPath = injectionLibraryPathBuf;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* At this point, injectionLibraryPath is specified if a dynamic
|
||||
* injection library was specified by a tool. */
|
||||
if (injectionLibraryPath)
|
||||
{
|
||||
/* Load the injection library */
|
||||
injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
|
||||
if (!injectionLibraryHandle)
|
||||
{
|
||||
NVTX_ERR("Failed to load injection library\n");
|
||||
return NVTX_ERR_INIT_LOAD_LIBRARY;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Attempt to get the injection library's entry-point */
|
||||
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
|
||||
if (!init_fnptr)
|
||||
{
|
||||
NVTX_DLLCLOSE(injectionLibraryHandle);
|
||||
NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n");
|
||||
return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
|
||||
if (!init_fnptr)
|
||||
{
|
||||
/* Check weakly-defined function pointer. A statically-linked injection can define this as
|
||||
* a normal symbol and it will take precedence over a dynamic injection. */
|
||||
if (InitializeInjectionNvtx2_fnptr)
|
||||
{
|
||||
init_fnptr = InitializeInjectionNvtx2_fnptr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* At this point, if init_fnptr is not set, then no tool has specified
|
||||
* an NVTX injection library -- return non-success result so all NVTX
|
||||
* API functions will be set to no-ops. */
|
||||
if (!init_fnptr)
|
||||
{
|
||||
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
|
||||
}
|
||||
|
||||
/* Invoke injection library's initialization function. If it returns
|
||||
* 0 (failure) and a dynamic injection was loaded, unload it. */
|
||||
entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable));
|
||||
if (entryPointStatus == 0)
|
||||
{
|
||||
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
|
||||
if (injectionLibraryHandle)
|
||||
{
|
||||
NVTX_DLLCLOSE(injectionLibraryHandle);
|
||||
}
|
||||
return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT;
|
||||
}
|
||||
|
||||
return NVTX_SUCCESS;
|
||||
}
|
||||
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void)
|
||||
{
|
||||
unsigned int old;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
NVTX_ATOMIC_CAS_32(
|
||||
old,
|
||||
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
|
||||
NVTX_INIT_STATE_STARTED,
|
||||
NVTX_INIT_STATE_FRESH);
|
||||
if (old == NVTX_INIT_STATE_FRESH)
|
||||
{
|
||||
int result;
|
||||
int forceAllToNoops;
|
||||
|
||||
/* Load & initialize injection library -- it will assign the function pointers */
|
||||
result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)();
|
||||
|
||||
/* Set all pointers not assigned by the injection to null */
|
||||
forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops);
|
||||
|
||||
/* Signal that initialization has finished, so now the assigned function pointers will be used */
|
||||
NVTX_ATOMIC_WRITE_32(
|
||||
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
|
||||
NVTX_INIT_STATE_COMPLETE);
|
||||
}
|
||||
else /* Spin-wait until initialization has finished */
|
||||
{
|
||||
NVTX_MEMBAR();
|
||||
while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE)
|
||||
{
|
||||
NVTX_YIELD();
|
||||
NVTX_MEMBAR();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle);
|
||||
@@ -0,0 +1,573 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
/* _impl_init variant of nvtxMarkEx: calls the versioned nvtxInitOnce routine,
 * then forwards eventAttrib to nvtxMarkEx. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkEx(eventAttrib);
}
|
||||
|
||||
/* _impl_init variant of nvtxMarkA: calls the versioned nvtxInitOnce routine,
 * then forwards message to nvtxMarkA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkA(message);
}
|
||||
|
||||
/* _impl_init variant of nvtxMarkW: calls the versioned nvtxInitOnce routine,
 * then forwards the wide-character message to nvtxMarkW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkW(message);
}
|
||||
|
||||
/* _impl_init variant of nvtxRangeStartEx: calls the versioned nvtxInitOnce
 * routine, then forwards eventAttrib to nvtxRangeStartEx and returns its
 * result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartEx(eventAttrib);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxRangeStartA: calls the versioned nvtxInitOnce
 * routine, then forwards message to nvtxRangeStartA and returns its result
 * unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartA(message);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxRangeStartW: calls the versioned nvtxInitOnce
 * routine, then forwards the wide-character message to nvtxRangeStartW and
 * returns its result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartW(message);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxRangeEnd: calls the versioned nvtxInitOnce
 * routine, then forwards id to nvtxRangeEnd. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxRangeEnd(id);
}
|
||||
|
||||
/* _impl_init variant of nvtxRangePushEx: calls the versioned nvtxInitOnce
 * routine, then forwards eventAttrib to nvtxRangePushEx and returns its int
 * result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushEx(eventAttrib);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxRangePushA: calls the versioned nvtxInitOnce
 * routine, then forwards message to nvtxRangePushA and returns its int result
 * unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushA(message);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxRangePushW: calls the versioned nvtxInitOnce
 * routine, then forwards the wide-character message to nvtxRangePushW and
 * returns its int result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushW(message);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxRangePop: calls the versioned nvtxInitOnce
 * routine, then calls nvtxRangePop and returns its int result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePop();
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCategoryA: calls the versioned nvtxInitOnce
 * routine, then forwards (category, name) to nvtxNameCategoryA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameCategoryA(category, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCategoryW: calls the versioned nvtxInitOnce
 * routine, then forwards (category, name) to nvtxNameCategoryW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameCategoryW(category, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameOsThreadA: calls the versioned nvtxInitOnce
 * routine, then forwards (threadId, name) to nvtxNameOsThreadA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameOsThreadA(threadId, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameOsThreadW: calls the versioned nvtxInitOnce
 * routine, then forwards (threadId, name) to nvtxNameOsThreadW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameOsThreadW(threadId, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainMarkEx: calls the versioned nvtxInitOnce
 * routine, then forwards (domain, eventAttrib) to nvtxDomainMarkEx. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainMarkEx(domain, eventAttrib);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainRangeStartEx: calls the versioned
 * nvtxInitOnce routine, then forwards (domain, eventAttrib) to
 * nvtxDomainRangeStartEx and returns its result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangeStartEx(domain, eventAttrib);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainRangeEnd: calls the versioned nvtxInitOnce
 * routine, then forwards (domain, id) to nvtxDomainRangeEnd. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainRangeEnd(domain, id);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainRangePushEx: calls the versioned
 * nvtxInitOnce routine, then forwards (domain, eventAttrib) to
 * nvtxDomainRangePushEx and returns its int result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangePushEx(domain, eventAttrib);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainRangePop: calls the versioned nvtxInitOnce
 * routine, then forwards domain to nvtxDomainRangePop and returns its int
 * result unchanged. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangePop(domain);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainResourceCreate: calls the versioned
 * nvtxInitOnce routine, then forwards (domain, attribs) to
 * nvtxDomainResourceCreate and returns the handle it produces. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
    nvtxResourceHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainResourceCreate(domain, attribs);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainResourceDestroy: calls the versioned
 * nvtxInitOnce routine, then forwards resource to nvtxDomainResourceDestroy. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainResourceDestroy(resource);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainNameCategoryA: calls the versioned
 * nvtxInitOnce routine, then forwards (domain, category, name) to
 * nvtxDomainNameCategoryA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainNameCategoryA(domain, category, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainNameCategoryW: calls the versioned
 * nvtxInitOnce routine, then forwards (domain, category, name) to
 * nvtxDomainNameCategoryW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainNameCategoryW(domain, category, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainRegisterStringA: calls the versioned
 * nvtxInitOnce routine, then forwards (domain, string) to
 * nvtxDomainRegisterStringA and returns the handle it produces. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string)
{
    nvtxStringHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRegisterStringA(domain, string);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainRegisterStringW: calls the versioned
 * nvtxInitOnce routine, then forwards (domain, string) to
 * nvtxDomainRegisterStringW and returns the handle it produces. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string)
{
    nvtxStringHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRegisterStringW(domain, string);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainCreateA: calls the versioned nvtxInitOnce
 * routine, then forwards message to nvtxDomainCreateA and returns the handle
 * it produces. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message)
{
    nvtxDomainHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainCreateA(message);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainCreateW: calls the versioned nvtxInitOnce
 * routine, then forwards the wide-character message to nvtxDomainCreateW and
 * returns the handle it produces. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message)
{
    nvtxDomainHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainCreateW(message);
    return result;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainDestroy: calls the versioned nvtxInitOnce
 * routine, then forwards domain to nvtxDomainDestroy. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainDestroy(domain);
}
|
||||
|
||||
/* _impl_init variant of nvtxInitialize: calls the versioned nvtxInitOnce
 * routine, then forwards the reserved pointer to nvtxInitialize. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxInitialize(reserved);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuDeviceA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuDeviceA_impl_fnptr and, when that
 * pointer is non-null, invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name)
{
    nvtxNameCuDeviceA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(device, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuDeviceW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuDeviceW_impl_fnptr and, when that
 * pointer is non-null, invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name)
{
    nvtxNameCuDeviceW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(device, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuContextA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuContextA_impl_fnptr and, when
 * that pointer is non-null, invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name)
{
    nvtxNameCuContextA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(context, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuContextW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuContextW_impl_fnptr and, when
 * that pointer is non-null, invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name)
{
    nvtxNameCuContextW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(context, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuStreamA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuStreamA_impl_fnptr and, when that
 * pointer is non-null, invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name)
{
    nvtxNameCuStreamA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(stream, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuStreamW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuStreamW_impl_fnptr and, when that
 * pointer is non-null, invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name)
{
    nvtxNameCuStreamW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(stream, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuEventA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuEventA_impl_fnptr and, when that
 * pointer is non-null, invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name)
{
    nvtxNameCuEventA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(event, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCuEventW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCuEventW_impl_fnptr and, when that
 * pointer is non-null, invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name)
{
    nvtxNameCuEventW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(event, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCudaDeviceA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCudaDeviceA_impl_fnptr and, when
 * that pointer is non-null, invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name)
{
    nvtxNameCudaDeviceA_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(device, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCudaDeviceW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCudaDeviceW_impl_fnptr and, when
 * that pointer is non-null, invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name)
{
    nvtxNameCudaDeviceW_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(device, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCudaStreamA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCudaStreamA_impl_fnptr and, when
 * that pointer is non-null, invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name)
{
    nvtxNameCudaStreamA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(stream, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCudaStreamW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCudaStreamW_impl_fnptr and, when
 * that pointer is non-null, invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name)
{
    nvtxNameCudaStreamW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(stream, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCudaEventA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCudaEventA_impl_fnptr and, when
 * that pointer is non-null, invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name)
{
    nvtxNameCudaEventA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(event, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameCudaEventW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameCudaEventW_impl_fnptr and, when
 * that pointer is non-null, invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name)
{
    nvtxNameCudaEventW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(event, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClDeviceA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClDeviceA_impl_fnptr and, when that
 * pointer is non-null, invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name)
{
    nvtxNameClDeviceA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(device, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClDeviceW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClDeviceW_impl_fnptr and, when that
 * pointer is non-null, invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name)
{
    nvtxNameClDeviceW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(device, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClContextA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClContextA_impl_fnptr and, when
 * that pointer is non-null, invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name)
{
    nvtxNameClContextA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(context, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClContextW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClContextW_impl_fnptr and, when
 * that pointer is non-null, invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name)
{
    nvtxNameClContextW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(context, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClCommandQueueA: calls the versioned
 * nvtxInitOnce routine, then reads nvtxGlobals.nvtxNameClCommandQueueA_impl_fnptr
 * and, when that pointer is non-null, invokes it with (command_queue, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name)
{
    nvtxNameClCommandQueueA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(command_queue, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClCommandQueueW: calls the versioned
 * nvtxInitOnce routine, then reads nvtxGlobals.nvtxNameClCommandQueueW_impl_fnptr
 * and, when that pointer is non-null, invokes it with (command_queue, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name)
{
    nvtxNameClCommandQueueW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(command_queue, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClMemObjectA: calls the versioned
 * nvtxInitOnce routine, then reads nvtxGlobals.nvtxNameClMemObjectA_impl_fnptr
 * and, when that pointer is non-null, invokes it with (memobj, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name)
{
    nvtxNameClMemObjectA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(memobj, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClMemObjectW: calls the versioned
 * nvtxInitOnce routine, then reads nvtxGlobals.nvtxNameClMemObjectW_impl_fnptr
 * and, when that pointer is non-null, invokes it with (memobj, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name)
{
    nvtxNameClMemObjectW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(memobj, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClSamplerA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClSamplerA_impl_fnptr and, when
 * that pointer is non-null, invokes it with (sampler, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name)
{
    nvtxNameClSamplerA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(sampler, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClSamplerW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClSamplerW_impl_fnptr and, when
 * that pointer is non-null, invokes it with (sampler, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name)
{
    nvtxNameClSamplerW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(sampler, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClProgramA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClProgramA_impl_fnptr and, when
 * that pointer is non-null, invokes it with (program, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name)
{
    nvtxNameClProgramA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(program, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClProgramW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClProgramW_impl_fnptr and, when
 * that pointer is non-null, invokes it with (program, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name)
{
    nvtxNameClProgramW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(program, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClEventA: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClEventA_impl_fnptr and, when that
 * pointer is non-null, invokes it with (evnt, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name)
{
    nvtxNameClEventA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(evnt, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxNameClEventW: calls the versioned nvtxInitOnce
 * routine, then reads nvtxGlobals.nvtxNameClEventW_impl_fnptr and, when that
 * pointer is non-null, invokes it with (evnt, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name)
{
    nvtxNameClEventW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(evnt, name);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainSyncUserCreate: calls the versioned
 * nvtxInitOnce routine, then reads nvtxGlobals.nvtxDomainSyncUserCreate_impl_fnptr.
 * Returns the result of calling that pointer with (domain, attribs) when it is
 * non-null, otherwise a zero nvtxSyncUser_t. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
    nvtxDomainSyncUserCreate_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    return impl ? impl(domain, attribs) : (nvtxSyncUser_t)0;
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainSyncUserDestroy: calls the versioned
 * nvtxInitOnce routine, then reads nvtxGlobals.nvtxDomainSyncUserDestroy_impl_fnptr
 * and, when that pointer is non-null, invokes it with handle. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserDestroy_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(handle);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainSyncUserAcquireStart: calls the versioned
 * nvtxInitOnce routine, then reads
 * nvtxGlobals.nvtxDomainSyncUserAcquireStart_impl_fnptr and, when that pointer
 * is non-null, invokes it with handle. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireStart_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(handle);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainSyncUserAcquireFailed: calls the versioned
 * nvtxInitOnce routine, then reads
 * nvtxGlobals.nvtxDomainSyncUserAcquireFailed_impl_fnptr and, when that
 * pointer is non-null, invokes it with handle. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireFailed_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(handle);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainSyncUserAcquireSuccess: calls the versioned
 * nvtxInitOnce routine, then reads
 * nvtxGlobals.nvtxDomainSyncUserAcquireSuccess_impl_fnptr and, when that
 * pointer is non-null, invokes it with handle. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireSuccess_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(handle);
}
|
||||
|
||||
/* _impl_init variant of nvtxDomainSyncUserReleasing: calls the versioned
 * nvtxInitOnce routine, then reads
 * nvtxGlobals.nvtxDomainSyncUserReleasing_impl_fnptr and, when that pointer is
 * non-null, invokes it with handle. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserReleasing_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
    if (!impl) {
        return; /* slot is empty: nothing to call */
    }
    impl(handle);
}
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops)
|
||||
{
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL;
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef __NVTX_LINKONCE_H__
|
||||
#define __NVTX_LINKONCE_H__
|
||||
|
||||
/* This header defines macros to permit making definitions of global variables
|
||||
* and functions in C/C++ header files which may be included multiple times in
|
||||
* a translation unit or linkage unit. It allows authoring header-only libraries
|
||||
* which can be used by multiple other header-only libraries (either as the same
|
||||
* copy or multiple copies), and does not require any build changes, such as
|
||||
* adding another .c file, linking a static library, or deploying a dynamic
|
||||
* library. Globals defined with these macros have the property that they have
|
||||
* the same address, pointing to a single instance, for the entire linkage unit.
|
||||
* It is expected but not guaranteed that each linkage unit will have a separate
|
||||
* instance.
|
||||
*
|
||||
* In some situations it is desirable to declare a variable without initializing
|
||||
* it, refer to it in code or other variables' initializers, and then initialize
|
||||
* it later. Similarly, functions can be prototyped, have their address taken,
|
||||
* and then have their body defined later. In such cases, use the FWDDECL macros
|
||||
* when forward-declaring LINKONCE global variables without initializers and
|
||||
* function prototypes, and then use the DEFINE macros when later defining them.
|
||||
* Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
|
||||
* following this pattern makes code maximally portable.
|
||||
*/
|
||||
|
||||
#if defined(__MINGW32__) /* MinGW */
|
||||
#define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
|
||||
#if defined(__cplusplus)
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK
|
||||
#else
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
|
||||
#endif
|
||||
#elif defined(_MSC_VER) /* MSVC */
|
||||
#if defined(__cplusplus)
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany)
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
|
||||
#else
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION __inline
|
||||
#endif
|
||||
#elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */
|
||||
#define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
|
||||
#if defined(__cplusplus)
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK
|
||||
#else
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
|
||||
#endif
|
||||
#elif defined(__CYGWIN__) /* Assume GCC or compatible */
|
||||
#define NVTX_LINKONCE_WEAK __attribute__((weak))
|
||||
#if defined(__cplusplus)
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
|
||||
#else
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
|
||||
#endif
|
||||
#else /* All others: Assume GCC, clang, or compatible */
|
||||
#define NVTX_LINKONCE_WEAK __attribute__((weak))
|
||||
#define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden")))
|
||||
#if defined(__cplusplus)
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline
|
||||
#else
|
||||
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
|
||||
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern
|
||||
#define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION
|
||||
|
||||
#endif /* __NVTX_LINKONCE_H__ */
|
||||
@@ -0,0 +1,304 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* This header defines types which are used by the internal implementation
|
||||
* of NVTX and callback subscribers. API clients do not use these types,
|
||||
* so they are defined here instead of in nvToolsExt.h to clarify they are
|
||||
* not part of the NVTX client API. */
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h.
|
||||
#endif
|
||||
|
||||
/* ------ Dependency-free types binary-compatible with real types ------- */
|
||||
|
||||
/* In order to avoid having the NVTX core API headers depend on non-NVTX
|
||||
* headers like cuda.h, NVTX defines binary-compatible types to use for
|
||||
* safely making the initialization versions of all NVTX functions without
|
||||
* needing to have definitions for the real types. */
|
||||
|
||||
typedef int nvtx_CUdevice;
|
||||
typedef void* nvtx_CUcontext;
|
||||
typedef void* nvtx_CUstream;
|
||||
typedef void* nvtx_CUevent;
|
||||
|
||||
typedef void* nvtx_cudaStream_t;
|
||||
typedef void* nvtx_cudaEvent_t;
|
||||
|
||||
typedef void* nvtx_cl_platform_id;
|
||||
typedef void* nvtx_cl_device_id;
|
||||
typedef void* nvtx_cl_context;
|
||||
typedef void* nvtx_cl_command_queue;
|
||||
typedef void* nvtx_cl_mem;
|
||||
typedef void* nvtx_cl_program;
|
||||
typedef void* nvtx_cl_kernel;
|
||||
typedef void* nvtx_cl_event;
|
||||
typedef void* nvtx_cl_sampler;
|
||||
|
||||
typedef struct nvtxSyncUser* nvtxSyncUser_t;
|
||||
struct nvtxSyncUserAttributes_v0;
|
||||
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
|
||||
|
||||
/* --------- Types for function pointers (with fake API types) ---------- */
|
||||
|
||||
typedef void (NVTX_API * nvtxMarkEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
|
||||
typedef void (NVTX_API * nvtxMarkA_impl_fntype)(const char* message);
|
||||
typedef void (NVTX_API * nvtxMarkW_impl_fntype)(const wchar_t* message);
|
||||
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
|
||||
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartA_impl_fntype)(const char* message);
|
||||
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartW_impl_fntype)(const wchar_t* message);
|
||||
typedef void (NVTX_API * nvtxRangeEnd_impl_fntype)(nvtxRangeId_t id);
|
||||
typedef int (NVTX_API * nvtxRangePushEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
|
||||
typedef int (NVTX_API * nvtxRangePushA_impl_fntype)(const char* message);
|
||||
typedef int (NVTX_API * nvtxRangePushW_impl_fntype)(const wchar_t* message);
|
||||
typedef int (NVTX_API * nvtxRangePop_impl_fntype)(void);
|
||||
typedef void (NVTX_API * nvtxNameCategoryA_impl_fntype)(uint32_t category, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCategoryW_impl_fntype)(uint32_t category, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameOsThreadA_impl_fntype)(uint32_t threadId, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameOsThreadW_impl_fntype)(uint32_t threadId, const wchar_t* name);
|
||||
|
||||
/* Real impl types are defined in nvtxImplCuda_v3.h, where CUDA headers are included */
|
||||
typedef void (NVTX_API * nvtxNameCuDeviceA_fakeimpl_fntype)(nvtx_CUdevice device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuDeviceW_fakeimpl_fntype)(nvtx_CUdevice device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextA_fakeimpl_fntype)(nvtx_CUcontext context, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextW_fakeimpl_fntype)(nvtx_CUcontext context, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamA_fakeimpl_fntype)(nvtx_CUstream stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamW_fakeimpl_fntype)(nvtx_CUstream stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventA_fakeimpl_fntype)(nvtx_CUevent event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventW_fakeimpl_fntype)(nvtx_CUevent event, const wchar_t* name);
|
||||
|
||||
/* Real impl types are defined in nvtxImplOpenCL_v3.h, where OPENCL headers are included */
|
||||
typedef void (NVTX_API * nvtxNameClDeviceA_fakeimpl_fntype)(nvtx_cl_device_id device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClDeviceW_fakeimpl_fntype)(nvtx_cl_device_id device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClContextA_fakeimpl_fntype)(nvtx_cl_context context, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClContextW_fakeimpl_fntype)(nvtx_cl_context context, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClCommandQueueA_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClCommandQueueW_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClMemObjectA_fakeimpl_fntype)(nvtx_cl_mem memobj, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClMemObjectW_fakeimpl_fntype)(nvtx_cl_mem memobj, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClSamplerA_fakeimpl_fntype)(nvtx_cl_sampler sampler, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClSamplerW_fakeimpl_fntype)(nvtx_cl_sampler sampler, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClProgramA_fakeimpl_fntype)(nvtx_cl_program program, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClProgramW_fakeimpl_fntype)(nvtx_cl_program program, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClEventA_fakeimpl_fntype)(nvtx_cl_event evnt, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClEventW_fakeimpl_fntype)(nvtx_cl_event evnt, const wchar_t* name);
|
||||
|
||||
/* Real impl types are defined in nvtxImplCudaRt_v3.h, where CUDART headers are included */
|
||||
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamA_fakeimpl_fntype)(nvtx_cudaStream_t stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamW_fakeimpl_fntype)(nvtx_cudaStream_t stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventA_fakeimpl_fntype)(nvtx_cudaEvent_t event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventW_fakeimpl_fntype)(nvtx_cudaEvent_t event, const wchar_t* name);
|
||||
|
||||
typedef void (NVTX_API * nvtxDomainMarkEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
typedef nvtxRangeId_t (NVTX_API * nvtxDomainRangeStartEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
typedef void (NVTX_API * nvtxDomainRangeEnd_impl_fntype)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
|
||||
typedef int (NVTX_API * nvtxDomainRangePushEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
typedef int (NVTX_API * nvtxDomainRangePop_impl_fntype)(nvtxDomainHandle_t domain);
|
||||
typedef nvtxResourceHandle_t (NVTX_API * nvtxDomainResourceCreate_impl_fntype)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
|
||||
typedef void (NVTX_API * nvtxDomainResourceDestroy_impl_fntype)(nvtxResourceHandle_t resource);
|
||||
typedef void (NVTX_API * nvtxDomainNameCategoryA_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
|
||||
typedef void (NVTX_API * nvtxDomainNameCategoryW_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
|
||||
typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringA_impl_fntype)(nvtxDomainHandle_t domain, const char* string);
|
||||
typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringW_impl_fntype)(nvtxDomainHandle_t domain, const wchar_t* string);
|
||||
typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateA_impl_fntype)(const char* message);
|
||||
typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateW_impl_fntype)(const wchar_t* message);
|
||||
typedef void (NVTX_API * nvtxDomainDestroy_impl_fntype)(nvtxDomainHandle_t domain);
|
||||
typedef void (NVTX_API * nvtxInitialize_impl_fntype)(const void* reserved);
|
||||
|
||||
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
|
||||
|
||||
/* ---------------- Types for callback subscription --------------------- */
|
||||
|
||||
typedef const void *(NVTX_API * NvtxGetExportTableFunc_t)(uint32_t exportTableId);
|
||||
typedef int (NVTX_API * NvtxInitializeInjectionNvtxFunc_t)(NvtxGetExportTableFunc_t exportTable);
|
||||
|
||||
/* Identifies one NVTX callback module. A value of this type is the `module`
 * argument of NvtxExportTableCallbacks::GetModuleFunctionTable.
 * - 0 is reserved as the invalid id.
 * - NVTX_CB_MODULE_SIZE has no explicit value, so it is one more than the
 *   last named module (7 with the constants listed here).
 * - NVTX_CB_MODULE_FORCE_INT is 0x7fffffff; presumably it exists only to
 *   force an int-sized enum representation -- TODO confirm. */
typedef enum NvtxCallbackModule
|
||||
{
|
||||
NVTX_CB_MODULE_INVALID = 0,
|
||||
NVTX_CB_MODULE_CORE = 1,
|
||||
NVTX_CB_MODULE_CUDA = 2,
|
||||
NVTX_CB_MODULE_OPENCL = 3,
|
||||
NVTX_CB_MODULE_CUDART = 4,
|
||||
NVTX_CB_MODULE_CORE2 = 5,
|
||||
NVTX_CB_MODULE_SYNC = 6,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_CB_MODULE_SIZE,
|
||||
NVTX_CB_MODULE_FORCE_INT = 0x7fffffff
|
||||
} NvtxCallbackModule;
|
||||
|
||||
/* Callback ids for the CORE module. Each id is named after one of the
 * nvtx*_impl_fntype typedefs (MarkEx ... NameOsThreadW) declared earlier in
 * this header, in the same order.
 * - 0 is reserved as the invalid id.
 * - NVTX_CBID_CORE_SIZE takes the value after the last named id (16 here).
 * NOTE(review): presumably these ids index the function table returned for
 * NVTX_CB_MODULE_CORE -- confirm against the table construction code. */
typedef enum NvtxCallbackIdCore
|
||||
{
|
||||
NVTX_CBID_CORE_INVALID = 0,
|
||||
NVTX_CBID_CORE_MarkEx = 1,
|
||||
NVTX_CBID_CORE_MarkA = 2,
|
||||
NVTX_CBID_CORE_MarkW = 3,
|
||||
NVTX_CBID_CORE_RangeStartEx = 4,
|
||||
NVTX_CBID_CORE_RangeStartA = 5,
|
||||
NVTX_CBID_CORE_RangeStartW = 6,
|
||||
NVTX_CBID_CORE_RangeEnd = 7,
|
||||
NVTX_CBID_CORE_RangePushEx = 8,
|
||||
NVTX_CBID_CORE_RangePushA = 9,
|
||||
NVTX_CBID_CORE_RangePushW = 10,
|
||||
NVTX_CBID_CORE_RangePop = 11,
|
||||
NVTX_CBID_CORE_NameCategoryA = 12,
|
||||
NVTX_CBID_CORE_NameCategoryW = 13,
|
||||
NVTX_CBID_CORE_NameOsThreadA = 14,
|
||||
NVTX_CBID_CORE_NameOsThreadW = 15,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_CBID_CORE_SIZE,
|
||||
NVTX_CBID_CORE_FORCE_INT = 0x7fffffff
|
||||
} NvtxCallbackIdCore;
|
||||
|
||||
/* Callback ids for the CORE2 module. Each id is named after one of the
 * nvtxDomain*_impl_fntype / nvtxInitialize_impl_fntype typedefs declared
 * earlier in this header, in the same order.
 * - 0 is reserved as the invalid id.
 * - NVTX_CBID_CORE2_SIZE takes the value after the last named id (16 here).
 * NOTE(review): presumably these ids index the function table returned for
 * NVTX_CB_MODULE_CORE2 -- confirm against the table construction code. */
typedef enum NvtxCallbackIdCore2
|
||||
{
|
||||
NVTX_CBID_CORE2_INVALID = 0,
|
||||
NVTX_CBID_CORE2_DomainMarkEx = 1,
|
||||
NVTX_CBID_CORE2_DomainRangeStartEx = 2,
|
||||
NVTX_CBID_CORE2_DomainRangeEnd = 3,
|
||||
NVTX_CBID_CORE2_DomainRangePushEx = 4,
|
||||
NVTX_CBID_CORE2_DomainRangePop = 5,
|
||||
NVTX_CBID_CORE2_DomainResourceCreate = 6,
|
||||
NVTX_CBID_CORE2_DomainResourceDestroy = 7,
|
||||
NVTX_CBID_CORE2_DomainNameCategoryA = 8,
|
||||
NVTX_CBID_CORE2_DomainNameCategoryW = 9,
|
||||
NVTX_CBID_CORE2_DomainRegisterStringA = 10,
|
||||
NVTX_CBID_CORE2_DomainRegisterStringW = 11,
|
||||
NVTX_CBID_CORE2_DomainCreateA = 12,
|
||||
NVTX_CBID_CORE2_DomainCreateW = 13,
|
||||
NVTX_CBID_CORE2_DomainDestroy = 14,
|
||||
NVTX_CBID_CORE2_Initialize = 15,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_CBID_CORE2_SIZE,
|
||||
NVTX_CBID_CORE2_FORCE_INT = 0x7fffffff
|
||||
} NvtxCallbackIdCore2;
|
||||
|
||||
/* Callback ids for the CUDA module. Each id is named after one of the
 * nvtxNameCu*_fakeimpl_fntype typedefs declared earlier in this header,
 * in the same order.
 * - 0 is reserved as the invalid id.
 * - NVTX_CBID_CUDA_SIZE takes the value after the last named id (9 here).
 * NOTE(review): presumably these ids index the function table returned for
 * NVTX_CB_MODULE_CUDA -- confirm against the table construction code. */
typedef enum NvtxCallbackIdCuda
|
||||
{
|
||||
NVTX_CBID_CUDA_INVALID = 0,
|
||||
NVTX_CBID_CUDA_NameCuDeviceA = 1,
|
||||
NVTX_CBID_CUDA_NameCuDeviceW = 2,
|
||||
NVTX_CBID_CUDA_NameCuContextA = 3,
|
||||
NVTX_CBID_CUDA_NameCuContextW = 4,
|
||||
NVTX_CBID_CUDA_NameCuStreamA = 5,
|
||||
NVTX_CBID_CUDA_NameCuStreamW = 6,
|
||||
NVTX_CBID_CUDA_NameCuEventA = 7,
|
||||
NVTX_CBID_CUDA_NameCuEventW = 8,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_CBID_CUDA_SIZE,
|
||||
NVTX_CBID_CUDA_FORCE_INT = 0x7fffffff
|
||||
} NvtxCallbackIdCuda;
|
||||
|
||||
/* Callback ids for the CUDART module. Each id is named after one of the
 * nvtxNameCuda* function-pointer typedefs declared earlier in this header,
 * in the same order.
 * - 0 is reserved as the invalid id.
 * - NVTX_CBID_CUDART_SIZE takes the value after the last named id (7 here).
 * NOTE(review): presumably these ids index the function table returned for
 * NVTX_CB_MODULE_CUDART -- confirm against the table construction code. */
typedef enum NvtxCallbackIdCudaRt
|
||||
{
|
||||
NVTX_CBID_CUDART_INVALID = 0,
|
||||
NVTX_CBID_CUDART_NameCudaDeviceA = 1,
|
||||
NVTX_CBID_CUDART_NameCudaDeviceW = 2,
|
||||
NVTX_CBID_CUDART_NameCudaStreamA = 3,
|
||||
NVTX_CBID_CUDART_NameCudaStreamW = 4,
|
||||
NVTX_CBID_CUDART_NameCudaEventA = 5,
|
||||
NVTX_CBID_CUDART_NameCudaEventW = 6,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_CBID_CUDART_SIZE,
|
||||
NVTX_CBID_CUDART_FORCE_INT = 0x7fffffff
|
||||
} NvtxCallbackIdCudaRt;
|
||||
|
||||
/* Callback ids for the OPENCL module. Each id is named after one of the
 * nvtxNameCl*_fakeimpl_fntype typedefs declared earlier in this header,
 * in the same order.
 * - 0 is reserved as the invalid id.
 * - NVTX_CBID_OPENCL_SIZE takes the value after the last named id (15 here).
 * NOTE(review): presumably these ids index the function table returned for
 * NVTX_CB_MODULE_OPENCL -- confirm against the table construction code. */
typedef enum NvtxCallbackIdOpenCL
|
||||
{
|
||||
NVTX_CBID_OPENCL_INVALID = 0,
|
||||
NVTX_CBID_OPENCL_NameClDeviceA = 1,
|
||||
NVTX_CBID_OPENCL_NameClDeviceW = 2,
|
||||
NVTX_CBID_OPENCL_NameClContextA = 3,
|
||||
NVTX_CBID_OPENCL_NameClContextW = 4,
|
||||
NVTX_CBID_OPENCL_NameClCommandQueueA = 5,
|
||||
NVTX_CBID_OPENCL_NameClCommandQueueW = 6,
|
||||
NVTX_CBID_OPENCL_NameClMemObjectA = 7,
|
||||
NVTX_CBID_OPENCL_NameClMemObjectW = 8,
|
||||
NVTX_CBID_OPENCL_NameClSamplerA = 9,
|
||||
NVTX_CBID_OPENCL_NameClSamplerW = 10,
|
||||
NVTX_CBID_OPENCL_NameClProgramA = 11,
|
||||
NVTX_CBID_OPENCL_NameClProgramW = 12,
|
||||
NVTX_CBID_OPENCL_NameClEventA = 13,
|
||||
NVTX_CBID_OPENCL_NameClEventW = 14,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_CBID_OPENCL_SIZE,
|
||||
NVTX_CBID_OPENCL_FORCE_INT = 0x7fffffff
|
||||
} NvtxCallbackIdOpenCL;
|
||||
|
||||
/* Callback ids for the SYNC module. Each id is named after one of the
 * nvtxDomainSyncUser*_impl_fntype typedefs declared earlier in this header,
 * in the same order.
 * - 0 is reserved as the invalid id.
 * - NVTX_CBID_SYNC_SIZE takes the value after the last named id (7 here).
 * NOTE(review): presumably these ids index the function table returned for
 * NVTX_CB_MODULE_SYNC -- confirm against the table construction code. */
typedef enum NvtxCallbackIdSync
|
||||
{
|
||||
NVTX_CBID_SYNC_INVALID = 0,
|
||||
NVTX_CBID_SYNC_DomainSyncUserCreate = 1,
|
||||
NVTX_CBID_SYNC_DomainSyncUserDestroy = 2,
|
||||
NVTX_CBID_SYNC_DomainSyncUserAcquireStart = 3,
|
||||
NVTX_CBID_SYNC_DomainSyncUserAcquireFailed = 4,
|
||||
NVTX_CBID_SYNC_DomainSyncUserAcquireSuccess = 5,
|
||||
NVTX_CBID_SYNC_DomainSyncUserReleasing = 6,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_CBID_SYNC_SIZE,
|
||||
NVTX_CBID_SYNC_FORCE_INT = 0x7fffffff
|
||||
} NvtxCallbackIdSync;
|
||||
|
||||
/* IDs for NVTX export tables.
 * - 0 is reserved as the invalid id; id 2 is reserved.
 * - NVTX_ETID_SIZE takes the value after the last named id (4 here).
 * NOTE(review): presumably one of these is what callers pass as the
 * `exportTableId` argument of NvtxGetExportTableFunc_t, with
 * NVTX_ETID_CALLBACKS / NVTX_ETID_VERSIONINFO selecting the
 * NvtxExportTableCallbacks / NvtxExportTableVersionInfo structs -- confirm. */
|
||||
typedef enum NvtxExportTableID
|
||||
{
|
||||
NVTX_ETID_INVALID = 0,
|
||||
NVTX_ETID_CALLBACKS = 1,
|
||||
NVTX_ETID_RESERVED0 = 2,
|
||||
NVTX_ETID_VERSIONINFO = 3,
|
||||
/* --- New constants must only be added directly above this line --- */
|
||||
NVTX_ETID_SIZE,
|
||||
NVTX_ETID_FORCE_INT = 0x7fffffff
|
||||
} NvtxExportTableID;
|
||||
|
||||
typedef void (* NvtxFunctionPointer)(void); /* generic function pointer that must not be called as-is; cast it to the appropriate function type first */
|
||||
typedef NvtxFunctionPointer** NvtxFunctionTable; /* double pointer: an array (1) of pointers (2) to function pointers */
|
||||
|
||||
/* Export table that exposes the per-module callback function tables. */
typedef struct NvtxExportTableCallbacks
|
||||
{
|
||||
/* NOTE(review): presumably sizeof(NvtxExportTableCallbacks), mirroring
 * NvtxExportTableVersionInfo::struct_size -- confirm. */
size_t struct_size;
|
||||
|
||||
/* Returns, through out_table, an array of pointers to function pointers
 * for the requested module. NOTE(review): out_size presumably receives the
 * number of entries, and the meaning of the int result is not visible
 * here -- confirm against the implementation. */
|
||||
int (NVTX_API *GetModuleFunctionTable)(
|
||||
NvtxCallbackModule module,
|
||||
NvtxFunctionTable* out_table,
|
||||
unsigned int* out_size);
|
||||
} NvtxExportTableCallbacks;
|
||||
|
||||
/* Export table carrying NVTX version information, plus a hook through which
 * an attaching tool reports its own version. */
typedef struct NvtxExportTableVersionInfo
|
||||
{
|
||||
/* sizeof(NvtxExportTableVersionInfo) */
|
||||
size_t struct_size;
|
||||
|
||||
/* The API version comes from the NVTX library linked to the app. The
|
||||
* injection library can use this info to make some assumptions. */
|
||||
uint32_t version;
|
||||
|
||||
/* Reserved for alignment, do not use */
|
||||
uint32_t reserved0;
|
||||
|
||||
/* This must be set by tools when attaching to provide applications
|
||||
* the ability to, in emergency situations, detect problematic tools
|
||||
* versions and modify the NVTX source to prevent attaching anything
|
||||
* that causes trouble in the app. Currently, this value is ignored. */
|
||||
void (NVTX_API *SetInjectionNvtxVersion)(
|
||||
uint32_t version);
|
||||
} NvtxExportTableVersionInfo;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Executable
+15
@@ -0,0 +1,15 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NVTX_STUB_H_
|
||||
#define NCCL_NVTX_STUB_H_
|
||||
|
||||
// Domain tag type whose static `name` member is the string "NCCL".
struct nccl_domain{static constexpr char const* name{"NCCL"};};
|
||||
|
||||
// Stub definitions: both macros expand to nothing, so uses of them compile
// away. NOTE(review): presumably this header is selected when NVTX is
// disabled at build time (NVTX_DISABLE) -- confirm against the includer.
#define NVTX3_FUNC_RANGE_IN(domain)
|
||||
#define nvtxNameOsThreadA(syscall, thread)
|
||||
|
||||
#endif
|
||||
@@ -10,23 +10,34 @@
|
||||
#define NCCL_P2P_H_
|
||||
|
||||
struct ncclP2Pinfo {
|
||||
const void* sendbuff;
|
||||
void* recvbuff;
|
||||
ssize_t sendbytes;
|
||||
ssize_t recvbytes;
|
||||
};
|
||||
|
||||
struct ncclP2PConnect {
|
||||
int nrecv[MAXCHANNELS];
|
||||
int nsend[MAXCHANNELS];
|
||||
int* recv;
|
||||
int* send;
|
||||
void* buff;
|
||||
ssize_t nbytes;
|
||||
struct ncclP2Pinfo* next;
|
||||
};
|
||||
|
||||
struct ncclP2Plist {
|
||||
struct ncclP2Pinfo *peerlist;
|
||||
int count;
|
||||
struct ncclP2PConnect connect;
|
||||
struct ncclP2Pinfo *head;
|
||||
struct ncclP2Pinfo *tail;
|
||||
};
|
||||
|
||||
// Appends a new entry recording (buff, nBytes) at the tail of the p2p list.
// Returns ncclInternalError when p2p is NULL; an allocation failure is
// propagated by NCCLCHECK. The new entry's `next` link is never written here,
// so it keeps whatever ncclCalloc left in it (presumably zero/NULL -- confirm).
static ncclResult_t enqueueP2pInfo(ncclP2Plist* p2p, void* buff, ssize_t nBytes) {
  if (p2p == NULL) {
    return ncclInternalError;
  }

  struct ncclP2Pinfo* node;
  NCCLCHECK(ncclCalloc(&node, 1));
  node->nbytes = nBytes;
  node->buff = buff;

  // The first element of an empty list also becomes its head.
  if (p2p->head == NULL) {
    p2p->head = node;
  }
  // Link behind the current tail, if any, then advance the tail.
  if (p2p->tail != NULL) {
    p2p->tail->next = node;
  }
  p2p->tail = node;

  return ncclSuccess;
}
|
||||
|
||||
// Removes and frees the entry at the head of the p2p list.
// Returns ncclInternalError when p2p is NULL or when the list is empty
// (nothing to dequeue), and ncclSuccess otherwise.
static ncclResult_t dequeueP2pInfo(ncclP2Plist* p2p) {
  if (p2p == NULL) return ncclInternalError;
  struct ncclP2Pinfo* first = p2p->head;
  // Guard the empty list: the head would otherwise be dereferenced as NULL.
  if (first == NULL) return ncclInternalError;
  p2p->head = first->next;
  // Removing the only element leaves the list without a tail as well.
  if (p2p->tail == first) p2p->tail = NULL;
  free(first);
  return ncclSuccess;
}
|
||||
#endif
|
||||
|
||||
@@ -31,10 +31,11 @@ static void setEnvFile(const char* fileName) {
|
||||
int s=0; // Env Var Size
|
||||
while (line[s] != '\0' && line[s] != '=') s++;
|
||||
if (line[s] == '\0') continue;
|
||||
strncpy(envVar, line, std::min(1024,s));
|
||||
strncpy(envVar, line, std::min(1023,s));
|
||||
envVar[s] = '\0';
|
||||
s++;
|
||||
strncpy(envValue, line+s, 1024);
|
||||
strncpy(envValue, line+s, 1023);
|
||||
envValue[1023]='\0';
|
||||
setenv(envVar, envValue, 0);
|
||||
}
|
||||
if (line) free(line);
|
||||
|
||||
@@ -18,18 +18,23 @@ struct ncclProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
struct ncclChannel* channel;
|
||||
struct ncclConnector* connector;
|
||||
size_t sendbytes;
|
||||
size_t recvbytes;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int protocol;
|
||||
int segment; // Only for profiling
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
|
||||
// Internal state
|
||||
uint64_t head;
|
||||
uint64_t tail;
|
||||
uint64_t posted;
|
||||
uint64_t received; // Only used by recv proxy to wait for flush.
|
||||
uint64_t transmitted;
|
||||
uint64_t done;
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
int idle;
|
||||
@@ -38,14 +43,30 @@ struct ncclProxyArgs {
|
||||
pthread_mutex_t mutex;
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
struct ncclProxyArgs* nextGroup;
|
||||
struct ncclProxyArgs** proxyAppendPtr;
|
||||
};
|
||||
|
||||
// Bookkeeping for buffers shared through the proxy. Every array has
// 2*MAXCHANNELS entries.
// NOTE(review): the proxyAppend comment says the factor of two separates send
// and recv; presumably the cuda*/host* arrays use the same split, and the
// *Used arrays track slot occupancy of the matching *Buff -- confirm.
struct ncclProxySharedBuffers {
|
||||
int nslots;
|
||||
int slotSize;
|
||||
char* cudaBuff[2*MAXCHANNELS];
|
||||
int* cudaUsed[2*MAXCHANNELS];
|
||||
char* hostBuff[2*MAXCHANNELS];
|
||||
int* hostUsed[2*MAXCHANNELS];
|
||||
struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
|
||||
};
|
||||
|
||||
struct ncclProxyPool;
|
||||
// Proxy bookkeeping: a condition variable and mutexes, a stop flag, a pointer
// to the shared-buffer bookkeeping, ncclProxyArgs list pointers (ops, nextOps,
// nextOpsEnd, pool) and a pointer to ncclProxyPool.
// NOTE(review): which mutex guards which list is not visible here; presumably
// opsMutex pairs with nextOps/nextOpsEnd and poolMutex with pool/pools --
// confirm against the proxy implementation.
struct ncclProxyState {
|
||||
pthread_cond_t cond;
|
||||
pthread_mutex_t mutex;
|
||||
pthread_mutex_t opsMutex;
|
||||
pthread_mutex_t poolMutex;
|
||||
bool stop;
|
||||
struct ncclProxySharedBuffers* sharedBuffs;
|
||||
struct ncclProxyArgs* ops;
|
||||
struct ncclProxyArgs* nextOps;
|
||||
struct ncclProxyArgs* nextOpsEnd;
|
||||
struct ncclProxyArgs* pool;
|
||||
struct ncclProxyPool* pools;
|
||||
};
|
||||
@@ -59,12 +80,16 @@ enum proxyMode {
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel);
|
||||
ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
||||
|
||||
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
|
||||
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
// Spin wait until func evaluates to true
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -21,6 +21,7 @@
|
||||
#define SLEEP_INT 1000 // connection retry sleep interval in usec
|
||||
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
|
||||
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
|
||||
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
|
||||
|
||||
/* Common socket address storage structure for IPv4/IPv6 */
|
||||
union socketAddress {
|
||||
@@ -64,7 +65,7 @@ static inline int envSocketFamily(void) {
|
||||
|
||||
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[1024];
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
#endif
|
||||
struct netIf userIfs[MAX_IFS];
|
||||
bool searchNot = prefixList && prefixList[0] == '^';
|
||||
@@ -167,9 +168,9 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
|
||||
|
||||
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[1024];
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
#endif
|
||||
char line_a[1024];
|
||||
char line_a[SOCKET_NAME_MAXLEN+1];
|
||||
int found = 0;
|
||||
struct ifaddrs *interfaces, *interface;
|
||||
getifaddrs(&interfaces);
|
||||
@@ -355,7 +356,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
|
||||
SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[1024];
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
|
||||
#endif
|
||||
|
||||
@@ -370,6 +371,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
|
||||
static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
|
||||
/* IPv4/IPv6 support */
|
||||
int family = remoteAddr->sa.sa_family;
|
||||
if (family != AF_INET && family != AF_INET6) {
|
||||
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
|
||||
return ncclInternalError;
|
||||
}
|
||||
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
|
||||
|
||||
/* Connect to a hostname / port */
|
||||
@@ -386,10 +391,8 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
|
||||
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
|
||||
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
|
||||
|
||||
char line[1024];
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
|
||||
#endif
|
||||
|
||||
int ret;
|
||||
int timedout_retries = 0;
|
||||
@@ -450,7 +453,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t socketReceive(int fd, void* ptr, int size) {
|
||||
static ncclResult_t socketRecv(int fd, void* ptr, int size) {
|
||||
int offset = 0;
|
||||
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, ptr, size, &offset));
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -41,8 +41,8 @@ struct ncclConnect {
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
|
||||
ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
|
||||
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
|
||||
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
|
||||
ncclResult_t (*free)(void*);
|
||||
ncclResult_t (*proxy)(struct ncclProxyArgs*);
|
||||
};
|
||||
@@ -54,6 +54,7 @@ struct ncclTransport {
|
||||
struct ncclTransportComm recv;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -7,7 +7,7 @@
|
||||
#ifndef NCCL_TREES_H_
|
||||
#define NCCL_TREES_H_
|
||||
|
||||
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
|
||||
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
|
||||
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
|
||||
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
|
||||
|
||||
#endif
|
||||
|
||||
+169
-153
@@ -41,7 +41,7 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
|
||||
#endif
|
||||
|
||||
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "Gather", "Scatter", "AllToAll", "AllToAllv" };
|
||||
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
|
||||
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
|
||||
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
|
||||
|
||||
@@ -160,47 +160,67 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
void *ncclCommThreadMain(void *arg) {
|
||||
ncclComm_t comm = (ncclComm_t)arg;
|
||||
int head = comm->hostDevComm.collTraceHead;
|
||||
do {
|
||||
int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS;
|
||||
int head = comm->hostDevComm.collTraceHead;
|
||||
int count;
|
||||
if (head <= tail)
|
||||
count = tail - head;
|
||||
else
|
||||
count = COLLTRACE_NUM_ITEMS + head - tail;
|
||||
usleep(1000); //sleep 1ms
|
||||
if (!count) {
|
||||
if(LOAD(&comm->hostDevComm.collTraceExit))
|
||||
break;
|
||||
else {
|
||||
usleep(1000); //sleep 1ms
|
||||
continue;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < count; i++) {
|
||||
uint8_t type = LOAD(&(comm->hostDevComm.collTrace[head].type));
|
||||
if (type == ncclCollTraceNotReady)
|
||||
break;
|
||||
char line[1024];
|
||||
int offset = 0;
|
||||
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
|
||||
sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
|
||||
(double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount);
|
||||
offset = strlen(line);
|
||||
switch (comm->hostDevComm.collTrace[head].type) {
|
||||
case ncclCollTraceKernelLaunchType:
|
||||
sprintf(line+offset, " KL hwid %8x funcIndex %d",
|
||||
comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex);
|
||||
break;
|
||||
case ncclCollTraceCollEndType:
|
||||
if (comm->hostDevComm.collTrace[head].funcIndex != -1)
|
||||
sprintf(line+offset, " CE next funcIndex %d",
|
||||
comm->hostDevComm.collTrace[head].funcIndex);
|
||||
else
|
||||
sprintf(line+offset, " KE");
|
||||
break;
|
||||
case ncclCollTraceAbortType:
|
||||
sprintf(line+offset, " Abort");
|
||||
break;
|
||||
default:
|
||||
sprintf(line+offset, " unknown collective trace data type");
|
||||
break;
|
||||
if (type == ncclCollTraceDataType) {
|
||||
sprintf(line, "## [%12.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
|
||||
(double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid,
|
||||
comm->hostDevComm.collTrace[head].funcIndex,
|
||||
comm->hostDevComm.collTrace[head].data_0,
|
||||
comm->hostDevComm.collTrace[head].opCount,
|
||||
comm->hostDevComm.collTrace[head].data_1);
|
||||
} else {
|
||||
sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
|
||||
(double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount);
|
||||
offset = strlen(line);
|
||||
switch (type) {
|
||||
case ncclCollTraceKernelLaunchType:
|
||||
sprintf(line+offset, " KL hwid %8x funcIndex %d",
|
||||
comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex);
|
||||
break;
|
||||
case ncclCollTraceCollEndType:
|
||||
if (comm->hostDevComm.collTrace[head].funcIndex != -1)
|
||||
sprintf(line+offset, " CE next funcIndex %d",
|
||||
comm->hostDevComm.collTrace[head].funcIndex);
|
||||
else
|
||||
sprintf(line+offset, " KE");
|
||||
break;
|
||||
case ncclCollTraceAbortType:
|
||||
sprintf(line+offset, " Abort");
|
||||
break;
|
||||
default:
|
||||
sprintf(line+offset, " unknown collective trace data type");
|
||||
break;
|
||||
}
|
||||
}
|
||||
INFO(NCCL_COLL, "%s", line);
|
||||
STORE(&(comm->hostDevComm.collTrace[head].type), ncclCollTraceNotReady);
|
||||
head ++;
|
||||
head %= COLLTRACE_NUM_ITEMS;
|
||||
}
|
||||
comm->hostDevComm.collTraceHead = tail;
|
||||
} while(!LOAD(&comm->hostDevComm.collTraceExit));
|
||||
} while(1);
|
||||
comm->hostDevComm.collTraceHead = head;
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#endif
|
||||
@@ -210,9 +230,11 @@ void *ncclCommThreadMain(void *arg) {
|
||||
static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
free(comm->p2plist.peerlist);
|
||||
free(comm->p2plist.connect.recv);
|
||||
free(comm->p2plist.connect.send);
|
||||
free(comm->connectSend);
|
||||
free(comm->connectRecv);
|
||||
free(comm->p2pSends);
|
||||
free(comm->p2pRecvs);
|
||||
free(comm->asyncOps);
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
|
||||
@@ -290,7 +312,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->intraCGMode);
|
||||
free(comm->intraCC);
|
||||
}
|
||||
CUDACHECK(hipHostFree((void *)comm->abortFlag));
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
|
||||
|
||||
// Poison comm to try and catch a double free
|
||||
commPoison(comm);
|
||||
@@ -319,7 +341,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
struct ncclComm* comm;
|
||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||
|
||||
comm->rank = comm->hostDevComm.rank =rank;
|
||||
comm->rank = comm->hostDevComm.rank = rank;
|
||||
comm->nRanks = comm->hostDevComm.nRanks = ndev;
|
||||
hipGetDevice(&comm->cudaDev);
|
||||
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
||||
@@ -355,17 +377,25 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->hostDevComm.collTraceThread = 0;
|
||||
#endif
|
||||
comm->collNetSupport = 0;
|
||||
comm->p2plist.count=0;
|
||||
NCCLCHECK(ncclCalloc(&comm->p2plist.peerlist, comm->nRanks));
|
||||
for (int r=0; r<comm->nRanks; r++) comm->p2plist.peerlist[r].sendbytes = comm->p2plist.peerlist[r].recvbytes = -1;
|
||||
NCCLCHECK(ncclCalloc(&comm->p2plist.connect.recv, MAXCHANNELS*comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->p2plist.connect.send, MAXCHANNELS*comm->nRanks));
|
||||
|
||||
NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS));
|
||||
comm->asyncOpCount = 0;
|
||||
comm->asyncTotalSize = 0;
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
|
||||
|
||||
comm->p2pSendCount = comm->p2pRecvCount = 0;
|
||||
NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));
|
||||
|
||||
// Mark channels as non initialized.
|
||||
for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
|
||||
|
||||
comm->alltoallDisable = false;
|
||||
if (rcclParamAllToAllDisable()) comm->alltoallDisable = true;
|
||||
comm->alltoallDisable = true;
|
||||
//if (rcclParamAllToAllDisable() == 0) comm->alltoallDisable = false;
|
||||
|
||||
*comret = comm;
|
||||
return ncclSuccess;
|
||||
@@ -373,11 +403,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
|
||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
// Duplicate the channels on the device
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, std::max(comm->nChannels, comm->p2pnChannels)));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, std::max(comm->nChannels, comm->p2pnChannels)));
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
|
||||
|
||||
// Copy userRanks and peers
|
||||
for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
|
||||
for (int r=0; r<comm->p2pnChannels; r++) {
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||
}
|
||||
|
||||
@@ -449,7 +479,7 @@ void* waitForNonNullPtr(void* p) {
|
||||
|
||||
ncclResult_t initParams(struct ncclComm* comm) {
|
||||
hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
|
||||
params->args =(void **)&comm->argsptr;
|
||||
params->args = (void **)&comm->argsptr;
|
||||
params->stream = NULL;
|
||||
params->sharedMem = 0;
|
||||
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
|
||||
@@ -518,8 +548,8 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
|
||||
|
||||
#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
|
||||
#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
|
||||
#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
|
||||
#define DEFAULT_BUFFSIZE_ARM (1LL << 20) /* 1MiB */
|
||||
#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
|
||||
#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */
|
||||
NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
|
||||
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
|
||||
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
|
||||
@@ -532,10 +562,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
|
||||
|
||||
if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
|
||||
if (comm->nRanks >= 32) {
|
||||
defaults[NCCL_PROTO_SIMPLE] = 524288;
|
||||
INFO(NCCL_INIT, "Setting DEFAULT_BUFFSIZE to %d for nRanks %d", defaults[NCCL_PROTO_SIMPLE], comm->nRanks);
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
comm->buffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
@@ -581,7 +607,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
|
||||
// setup
|
||||
struct ncclConnect myConnect;
|
||||
if (isMaster && ret > 0) {
|
||||
NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
|
||||
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
|
||||
}
|
||||
// prepare connect handles
|
||||
ncclResult_t res;
|
||||
@@ -611,7 +637,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
|
||||
}
|
||||
// connect
|
||||
if (isMaster && ret > 0) {
|
||||
NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
||||
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
||||
struct ncclPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
|
||||
CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
|
||||
@@ -669,10 +695,9 @@ NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
|
||||
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
|
||||
// We use 3 AllGathers
|
||||
// 1. { peerInfo, comm }
|
||||
// 2. ConnectTransport[nranks], ConnectValue[nranks]
|
||||
// 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
|
||||
// We use 2 AllGathers
|
||||
// 1. { peerInfo, comm, compCap}
|
||||
// 2. { nChannels, graphInfo, topoRanks }
|
||||
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
@@ -684,10 +709,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
struct {
|
||||
struct ncclPeerInfo peerInfo;
|
||||
struct ncclComm* comm;
|
||||
int cudaCompCap;
|
||||
} *allGather1Data;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
|
||||
allGather1Data[rank].comm = comm;
|
||||
allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
|
||||
struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
|
||||
NCCLCHECK(fillInfo(comm, myInfo, commHash));
|
||||
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
|
||||
@@ -700,7 +727,40 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
// AllGather1 data is used again below
|
||||
|
||||
// Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
|
||||
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
|
||||
int myCompCap = allGather1Data[rank].cudaCompCap;
|
||||
int minCompCap = myCompCap, maxCompCap = myCompCap;
|
||||
uint64_t otherHostHash;
|
||||
int tmpNnodes = 1;
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
|
||||
if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
|
||||
if (intraRanks == 0) intraRank0 = i;
|
||||
if (i == rank) intraRank = intraRanks;
|
||||
intraRanks++;
|
||||
}
|
||||
} else { // Determine whether number of nodes is 2 (for use in tree pattern determination)
|
||||
if (tmpNnodes == 1) {
|
||||
otherHostHash = allGather1Data[i].peerInfo.hostHash;
|
||||
tmpNnodes = 2;
|
||||
} else if (tmpNnodes == 2 && otherHostHash != allGather1Data[i].peerInfo.hostHash) {
|
||||
tmpNnodes = 3;
|
||||
}
|
||||
}
|
||||
minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
|
||||
maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
|
||||
}
|
||||
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
||||
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
|
||||
if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
|
||||
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
||||
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclComm* intraRank0Comm = allGather1Data[intraRank0].comm;
|
||||
|
||||
// AllGather1 - end
|
||||
|
||||
// Topo detection / System graph creation
|
||||
@@ -729,7 +789,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
|
||||
struct ncclTopoGraph treeGraph;
|
||||
treeGraph.id = 1;
|
||||
treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
treeGraph.pattern = tmpNnodes <= 2 ? NCCL_TOPO_PATTERN_TREE : NCCL_TOPO_PATTERN_BALANCED_TREE;
|
||||
treeGraph.crossNic = ncclParamCrossNic();
|
||||
treeGraph.collNet = 0;
|
||||
treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
|
||||
@@ -753,10 +813,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
|
||||
// AllGather3 - begin
|
||||
struct ncclGraphInfo {
|
||||
int pattern;
|
||||
int sameChannels;
|
||||
float speedIntra;
|
||||
float speedInter;
|
||||
int typeIntra;
|
||||
int typeInter;
|
||||
};
|
||||
|
||||
struct {
|
||||
@@ -776,29 +838,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
|
||||
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
|
||||
allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
|
||||
allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
|
||||
|
||||
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
|
||||
std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
|
||||
allGather3Data[rank].tree.pattern = treeGraph.pattern;
|
||||
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
|
||||
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
|
||||
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
|
||||
allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
|
||||
allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
|
||||
allGather3Data[rank].ring.pattern = ringGraph.pattern;
|
||||
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
|
||||
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
|
||||
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
|
||||
allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
|
||||
allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
|
||||
allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
|
||||
allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
|
||||
allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
|
||||
allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
|
||||
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
|
||||
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
|
||||
|
||||
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
|
||||
|
||||
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
|
||||
|
||||
// Determine nNodes, firstRanks, ...
|
||||
int* nodesFirstRank;
|
||||
int *nodesFirstRank, *nodesTreePatterns;
|
||||
NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
|
||||
NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
|
||||
for (int i=0; i<nranks; i++) {
|
||||
int node = -1;
|
||||
int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
|
||||
@@ -808,18 +878,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
if (node == -1) {
|
||||
node = comm->nNodes++;
|
||||
nodesFirstRank[node] = firstRank;
|
||||
// Record tree pattern of each node as they can be different depending on sm arch
|
||||
nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
|
||||
}
|
||||
if (i == comm->rank) comm->node = node;
|
||||
}
|
||||
|
||||
// Determine the minimum CUDA Compute capability of all GPUs
|
||||
int myCompCap = allGather3Data[rank].cudaCompCap;
|
||||
int minCompCap = myCompCap, maxCompCap = myCompCap;
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
|
||||
maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
|
||||
}
|
||||
|
||||
int nChannelsOrig = comm->nChannels;
|
||||
struct ncclTopoRanks** allTopoRanks;
|
||||
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
|
||||
@@ -835,15 +899,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
|
||||
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
|
||||
treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
|
||||
treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
|
||||
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
|
||||
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
|
||||
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
|
||||
ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
|
||||
ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
|
||||
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
|
||||
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
|
||||
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
|
||||
collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
|
||||
collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
|
||||
}
|
||||
|
||||
if (comm->alltoallDisable != alltoallDisable) {
|
||||
comm->alltoallDisable = alltoallDisable;
|
||||
}
|
||||
@@ -873,7 +941,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int *rings;
|
||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, gcn, nNets));
|
||||
if (comm->nNodes > 1 &&
|
||||
ncclParamCollNetEnable() == 1 &&
|
||||
collNetSupport() && collNetGraph.nChannels) {
|
||||
@@ -881,23 +949,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
}
|
||||
|
||||
free(allTopoRanks);
|
||||
free(nodesTreePatterns);
|
||||
free(nodesFirstRank);
|
||||
free(allGather1Data);
|
||||
free(allGather3Data);
|
||||
|
||||
// AllGather3 - end
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
|
||||
|
||||
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
|
||||
|
||||
char line[1024];
|
||||
line[0]='\0';
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclTree* treeUp = &comm->channels[c].treeUp;
|
||||
struct ncclTree* treeDn = &comm->channels[c].treeDn;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
|
||||
c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
|
||||
treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
|
||||
struct ncclTree* tree = &comm->channels[c].tree;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
|
||||
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
|
||||
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
|
||||
}
|
||||
line[1023] = '\0';
|
||||
@@ -913,16 +979,24 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(computeBuffSizes(comm));
|
||||
|
||||
// Connect with prev/next for each ring
|
||||
struct ncclConnect *connect;
|
||||
NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore);
|
||||
INFO(NCCL_INIT, "Connected all rings");
|
||||
|
||||
// Connect Trees
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore);
|
||||
INFO(NCCL_INIT, "Connected all trees");
|
||||
|
||||
// Check if we can setup CollNet
|
||||
if (comm->nNodes > 1 &&
|
||||
@@ -935,8 +1009,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
for (int c=0; c<logicChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
|
||||
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down));
|
||||
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up));
|
||||
const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
|
||||
const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
|
||||
if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
|
||||
@@ -944,82 +1018,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
|
||||
collNetSetupFail = 1;
|
||||
}
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph));
|
||||
// Verify CollNet setup across ranks
|
||||
NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
|
||||
}
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
|
||||
free(connect);
|
||||
free(rings);
|
||||
|
||||
// Compute time models for algorithm and protocol combinations
|
||||
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
|
||||
|
||||
// Compute nChannels per peer for p2p
|
||||
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
|
||||
|
||||
if (!alltoallDisable) {
|
||||
int nc = comm->nChannels;
|
||||
if (comm->topo->type == RCCL_TOPO_4P2H_ROME)
|
||||
nc = 2;
|
||||
for (int c=0; c<nc; c++) {
|
||||
const int peersPerChan = DIVUP(nranks, nc);
|
||||
struct ncclP2PConnect* connect = &comm->p2plist.connect;
|
||||
connect->nrecv[c] = 0;
|
||||
connect->nsend[c] = 0;
|
||||
for (int p=0; p<peersPerChan; p++) {
|
||||
// first channel is reserved for self copy
|
||||
if ((c*peersPerChan+p)%nranks == 0)
|
||||
continue;
|
||||
int peerSend = (rank+c*peersPerChan+p)%nranks;
|
||||
int peerRecv = (2*nranks+rank-(c*peersPerChan)%nranks-p)%nranks;
|
||||
if (comm->channels[c].peers[peerSend].send.connected == 0) {
|
||||
connect->send[c*nranks+connect->nsend[c]++] = peerSend;
|
||||
}
|
||||
if (comm->channels[c].peers[peerRecv].recv.connected == 0) {
|
||||
connect->recv[c*nranks+connect->nrecv[c]++] = peerRecv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int c=0; c<nc; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
struct ncclP2PConnect* connect = &comm->p2plist.connect;
|
||||
#if 0
|
||||
printf("channel %d recv: ", c);
|
||||
for (int i=0; i<connect->nrecv[c]; i++)
|
||||
printf("%d ", connect->recv[c*nranks+i]);
|
||||
printf("\n");
|
||||
printf("channel %d send: ", c);
|
||||
for (int i=0; i<connect->nsend[c]; i++)
|
||||
printf("%d ", connect->send[c*nranks+i]);
|
||||
printf("\n");
|
||||
#endif
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*nranks, connect->nsend[c], connect->send+c*nranks));
|
||||
connect->nrecv[c] = 0;
|
||||
connect->nsend[c] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Compute intra ranks (using AllGather1 data)
|
||||
do {
|
||||
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
|
||||
(allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
|
||||
if (intraRanks == 0) intraRank0 = i;
|
||||
if (i == rank) intraRank = intraRanks;
|
||||
intraRanks++;
|
||||
}
|
||||
}
|
||||
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
||||
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
|
||||
if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
|
||||
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
|
||||
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
|
||||
} while(0);
|
||||
|
||||
// Done with AllGather1 data
|
||||
free(allGather1Data);
|
||||
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
|
||||
|
||||
if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
|
||||
|
||||
@@ -1083,6 +1095,7 @@ end:
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev));
|
||||
@@ -1091,6 +1104,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
|
||||
if (ndev < 0) {
|
||||
WARN("Invalid device count requested : %d", ndev);
|
||||
@@ -1110,9 +1124,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
|
||||
static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
int savedDevice;
|
||||
#ifdef ENABLE_TRACE
|
||||
int rank = comm->rank;
|
||||
#endif
|
||||
CUDACHECK(hipGetDevice(&savedDevice));
|
||||
int commDevice = comm->cudaDev;
|
||||
|
||||
@@ -1120,7 +1131,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
CUDACHECK(hipSetDevice(commDevice));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError);
|
||||
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, LOAD(comm->abortFlag), comm->fatalError);
|
||||
|
||||
CUDACHECK(hipStreamSynchronize(comm->groupStream));
|
||||
NCCLCHECK(ncclProxyDestroy(comm));
|
||||
@@ -1129,13 +1140,14 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
if (savedDevice != commDevice)
|
||||
CUDACHECK(hipSetDevice(savedDevice));
|
||||
|
||||
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
|
||||
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -1152,6 +1164,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -1186,6 +1199,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
|
||||
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
|
||||
*count = comm->nRanks;
|
||||
@@ -1194,6 +1208,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
|
||||
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
|
||||
*devid = comm->cudaDev;
|
||||
@@ -1202,6 +1217,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
|
||||
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
|
||||
*rank = comm->rank;
|
||||
|
||||
@@ -46,26 +46,19 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
}
|
||||
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
|
||||
info->nBytes = info->count * ncclTypeSize(info->datatype);
|
||||
if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast
|
||||
|| info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAll) {
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
|
||||
info->count = info->nBytes;
|
||||
info->datatype = ncclInt8;
|
||||
}
|
||||
if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter
|
||||
|| info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAll)
|
||||
info->nBytes *= info->comm->nRanks; // count is per rank
|
||||
if (info->coll == ncclCollAllToAllv) {
|
||||
// Use count to store data type size for alltoallv
|
||||
info->count = ncclTypeSize(info->datatype);
|
||||
info->datatype = ncclInt8;
|
||||
}
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
|
||||
|
||||
if (info->op < 0 || info->op >= ncclNumOps) {
|
||||
WARN("%s : invalid reduction operation %d", info->opName, info->op);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
|
||||
if (info->comm->checkPointers) {
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
if (info->coll == ncclFuncSendRecv) {
|
||||
if (strcmp(info->opName, "Send") == 0) {
|
||||
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
|
||||
} else {
|
||||
@@ -73,10 +66,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
}
|
||||
} else {
|
||||
// Check CUDA device pointers
|
||||
if ((info->coll != ncclCollBroadcast && info->coll != ncclCollScatter) || info->comm->rank == info->root) {
|
||||
if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
|
||||
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
|
||||
}
|
||||
if ((info->coll != ncclCollReduce && info->coll != ncclCollGather) || info->comm->rank == info->root) {
|
||||
if (info->coll != ncclFuncReduce || info->comm->rank == info->root) {
|
||||
NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -16,14 +16,11 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
|
||||
static nvmlReturn_t (*nvmlInternalShutdown)(void);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
|
||||
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
|
||||
nvmlNvLinkCapability_t capability, unsigned int *capResult);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
|
||||
static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
|
||||
|
||||
// Used to make the NVML library calls thread safe
|
||||
@@ -74,10 +71,7 @@ ncclResult_t wrapNvmlSymbols(void) {
|
||||
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
|
||||
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
|
||||
LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
|
||||
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
|
||||
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
|
||||
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
|
||||
@@ -91,9 +85,6 @@ teardown:
|
||||
nvmlInternalShutdown = NULL;
|
||||
nvmlInternalDeviceGetHandleByPciBusId = NULL;
|
||||
nvmlInternalDeviceGetIndex = NULL;
|
||||
nvmlInternalDeviceGetHandleByIndex = NULL;
|
||||
nvmlInternalDeviceGetPciInfo = NULL;
|
||||
nvmlInternalDeviceGetMinorNumber = NULL;
|
||||
nvmlInternalDeviceGetNvLinkState = NULL;
|
||||
nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
|
||||
nvmlInternalDeviceGetNvLinkCapability = NULL;
|
||||
@@ -162,51 +153,6 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
|
||||
if (nvmlInternalDeviceGetHandleByIndex == NULL) {
|
||||
WARN("lib wrapper not initialized.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
WARN("nvmlDeviceGetHandleByIndex() failed: %s ",
|
||||
nvmlInternalErrorString(ret));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
|
||||
if (nvmlInternalDeviceGetPciInfo == NULL) {
|
||||
WARN("lib wrapper not initialized.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
WARN("nvmlDeviceGetPciInfo() failed: %s ",
|
||||
nvmlInternalErrorString(ret));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
|
||||
if (nvmlInternalDeviceGetMinorNumber == NULL) {
|
||||
WARN("lib wrapper not initialized.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
nvmlReturn_t ret;
|
||||
NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
WARN("nvmlDeviceGetMinorNumber() failed: %s ",
|
||||
nvmlInternalErrorString(ret));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
|
||||
if (nvmlInternalDeviceGetNvLinkState == NULL) {
|
||||
/* Do not warn, this symbol is optional. */
|
||||
|
||||
+376
-162
@@ -6,10 +6,10 @@
|
||||
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
#include "graph.h"
|
||||
#include "collectives.h"
|
||||
|
||||
#define RECV 0
|
||||
#define SEND 1
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
|
||||
@@ -19,15 +19,13 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
|
||||
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
|
||||
int index = pattern == ncclPatternPipelineFrom ?
|
||||
/* no recv / no send if root = */
|
||||
/* bcast */ (type == RECV ? myrank : nextrank ):
|
||||
/* reduce */ (type == RECV ? prevrank : myrank );
|
||||
/* bcast */ (type == proxyRecv ? myrank : nextrank ):
|
||||
/* reduce */ (type == proxyRecv ? prevrank : myrank );
|
||||
int rank = ring->userRanks[index];
|
||||
return (root != rank);
|
||||
}
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
|
||||
#define PROXYARGS_ALLOCATE_SIZE 32
|
||||
#define PROXYARGS_ALLOCATE_SIZE 128
|
||||
struct ncclProxyPool {
|
||||
struct ncclProxyPool *next;
|
||||
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
|
||||
@@ -36,7 +34,7 @@ struct ncclProxyPool {
|
||||
static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
struct ncclProxyArgs* elem;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
pthread_mutex_lock(&state->poolMutex);
|
||||
if (state->pool == NULL) {
|
||||
// Allocate a new pool of elements
|
||||
struct ncclProxyPool* newPool;
|
||||
@@ -54,39 +52,113 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a
|
||||
}
|
||||
elem = state->pool;
|
||||
state->pool = state->pool->next;
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
elem->next = elem->nextPeer = NULL;
|
||||
pthread_mutex_unlock(&state->poolMutex);
|
||||
elem->next = elem->nextPeer = elem->nextGroup = NULL;
|
||||
*argsptr = elem;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
|
||||
struct ncclComm* comm = connector->comm;
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
if (connector->proxyAppend == NULL) {
|
||||
// Nothing running for that peer. Add to the circular list
|
||||
if (state->ops == NULL) {
|
||||
// Create the list
|
||||
args->next = args;
|
||||
state->ops = args;
|
||||
} else {
|
||||
// Insert element in the list
|
||||
args->next = state->ops->next;
|
||||
state->ops->next = args;
|
||||
//#define DEBUG_PROXY 1
|
||||
#ifdef DEBUG_PROXY
|
||||
#define DEBUG_PROXY_PRINT printf
|
||||
#else
|
||||
#define DEBUG_PROXY_PRINT(...)
|
||||
#endif
|
||||
|
||||
#define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1)
|
||||
#define OP_SEEN 0x100000
|
||||
ncclResult_t dumpProxyState(struct ncclProxyState* state) {
|
||||
#ifdef DEBUG_PROXY
|
||||
struct ncclProxyArgs* op = state->ops;
|
||||
while (op) {
|
||||
if (op->idle & OP_SEEN) {
|
||||
WARN("Active list loop at element %ld\n", OP_INDEX(op));
|
||||
}
|
||||
op->idle |= OP_SEEN;
|
||||
printf("[%ld]", OP_INDEX(op));
|
||||
if (op->nextPeer) {
|
||||
printf("(%ld)", OP_INDEX(op->nextPeer));
|
||||
struct ncclProxyArgs* n = op->nextPeer;
|
||||
n->idle |= OP_SEEN;
|
||||
while (n->nextGroup || n->nextPeer) {
|
||||
n = n->nextGroup ? n->nextGroup : n->nextPeer;
|
||||
n->idle |= OP_SEEN;
|
||||
}
|
||||
}
|
||||
if (op->nextGroup) {
|
||||
printf("--G->");
|
||||
op = op->nextGroup;
|
||||
} else {
|
||||
printf("--N->");
|
||||
op = op->next;
|
||||
}
|
||||
connector->proxyAppend = args;
|
||||
} else {
|
||||
// There is an active operation already for that peer.
|
||||
// Add it to the per-peer list
|
||||
connector->proxyAppend->nextPeer = args;
|
||||
connector->proxyAppend = args;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
printf("[X]\n");
|
||||
|
||||
struct ncclProxyArgs* free = state->pool;
|
||||
while (free) {
|
||||
if (free->idle & OP_SEEN) {
|
||||
WARN("Free list loop at element %ld\n", OP_INDEX(free));
|
||||
}
|
||||
free->idle |= OP_SEEN;
|
||||
free = free->next;
|
||||
}
|
||||
|
||||
struct ncclProxyPool* p = state->pools;
|
||||
int i = 0;
|
||||
while (p) {
|
||||
for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
|
||||
if ((p->elems[e].idle & OP_SEEN) == 0) {
|
||||
WARN("Element %d of pool %d has been lost\n", e, i);
|
||||
struct ncclProxyArgs* free = state->pool;
|
||||
printf("Free list ");
|
||||
while (free) {
|
||||
printf("--> %ld ", OP_INDEX(free));
|
||||
free = free->next;
|
||||
}
|
||||
printf("\n");
|
||||
return ncclInternalError;
|
||||
}
|
||||
p->elems[e].idle -= OP_SEEN;
|
||||
}
|
||||
p = p->next;
|
||||
i++;
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
|
||||
static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args, int shared) {
|
||||
struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr;
|
||||
if (proxyAppend) {
|
||||
if (shared && proxyAppend->opCount == args->opCount) {
|
||||
args->next = proxyAppend->next;
|
||||
proxyAppend->next = NULL;
|
||||
proxyAppend->nextGroup = args;
|
||||
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group, prevGroup %5ld, next %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend), OP_INDEX(args->next));
|
||||
} else {
|
||||
proxyAppend->nextPeer = args;
|
||||
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
|
||||
}
|
||||
} else {
|
||||
// Nothing running for that peer. Add to the list
|
||||
if (state->ops == NULL) {
|
||||
// Create the list
|
||||
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element : \n", OP_INDEX(args), shared, args->opCount);
|
||||
state->ops = args;
|
||||
} else {
|
||||
// Append element at the end of the list
|
||||
struct ncclProxyArgs* last = state->ops;
|
||||
while (last->nextGroup || last->next) last = last->nextGroup ? last->nextGroup : last->next;
|
||||
last->next = args;
|
||||
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element : \n", OP_INDEX(args),shared, args->opCount);
|
||||
}
|
||||
}
|
||||
*(args->proxyAppendPtr) = args;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
|
||||
if (peer < 0) return ncclSuccess;
|
||||
|
||||
struct ncclPeer* peerComm = args->channel->peers+peer;
|
||||
@@ -98,107 +170,168 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
|
||||
}
|
||||
if (connector->transportComm->proxy == NULL) return ncclSuccess;
|
||||
|
||||
struct ncclProxyState* state = &connector->comm->proxyState;
|
||||
struct ncclProxyArgs* op;
|
||||
NCCLCHECK(allocateArgs(connector->comm, &op));
|
||||
memcpy(op, args, sizeof(struct ncclProxyArgs));
|
||||
op->connector = connector;
|
||||
op->progress = connector->transportComm->proxy;
|
||||
op->state = ncclProxyOpReady;
|
||||
ProxyAppend(connector, op);
|
||||
|
||||
op->proxyAppendPtr =
|
||||
connector->conn.shared ?
|
||||
state->sharedBuffs->proxyAppend+2*args->channel->id+type : // Shared buffers
|
||||
&connector->proxyAppend; // Dedicated buffers
|
||||
|
||||
if (state->nextOps == NULL) state->nextOps = op;
|
||||
else state->nextOpsEnd->next = op;
|
||||
state->nextOpsEnd = op;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
|
||||
struct ncclRing* ring = &args->channel->ring;
|
||||
if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
|
||||
if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
|
||||
if (NeedProxy(proxyRecv, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args));
|
||||
if (NeedProxy(proxySend, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args));
|
||||
}
|
||||
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree up
|
||||
struct ncclTree* tree = &args->channel->treeUp;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
|
||||
NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
|
||||
struct ncclTree* tree = &args->channel->tree;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args));
|
||||
NCCLCHECK(SaveProxy(proxySend, tree->up, args));
|
||||
}
|
||||
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree down
|
||||
struct ncclTree* tree = &args->channel->treeDn;
|
||||
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
|
||||
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
|
||||
struct ncclTree* tree = &args->channel->tree;
|
||||
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args));
|
||||
NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
|
||||
}
|
||||
if (pattern == ncclPatternCollTreeUp) {
|
||||
// CollTree up
|
||||
struct ncclTree* tree = &args->channel->collTreeUp;
|
||||
NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
|
||||
NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
|
||||
struct ncclTree* tree = &args->channel->collTree;
|
||||
NCCLCHECK(SaveProxy(proxyRecv, tree->down[0], args));
|
||||
NCCLCHECK(SaveProxy(proxySend, tree->up, args));
|
||||
}
|
||||
if (pattern == ncclPatternCollTreeDown) {
|
||||
// CollTree down
|
||||
struct ncclTree* tree = &args->channel->collTreeDn;
|
||||
NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
|
||||
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
|
||||
struct ncclTree* tree = &args->channel->collTree;
|
||||
NCCLCHECK(SaveProxy(proxySend, tree->down[0], args));
|
||||
NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel) {
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment) {
|
||||
struct ncclProxyArgs args;
|
||||
memset(&args, 0, sizeof(struct ncclProxyArgs));
|
||||
args.channel = channel;
|
||||
args.sliceSteps = 1;
|
||||
args.chunkSteps = 1;
|
||||
args.protocol = NCCL_PROTO_SIMPLE;
|
||||
args.opCount = info->comm->opCount;
|
||||
args.segment = segment;
|
||||
args.opCount = channel->workFifoTail-1;
|
||||
args.dtype = info->datatype;
|
||||
if (info->delta > 0 && info->sendbytes >= 0) {
|
||||
int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
|
||||
args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
|
||||
if (args.nsteps == 0) args.nsteps = 1;
|
||||
NCCLCHECK(SaveProxy<proxySend>(peersend, &args));
|
||||
}
|
||||
if (info->delta > 0 && info->recvbytes >= 0) {
|
||||
int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
|
||||
args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
|
||||
if (args.nsteps == 0) args.nsteps = 1;
|
||||
NCCLCHECK(SaveProxy<proxyRecv>(peerrecv, &args));
|
||||
args.recvbytes = info->recvbytes;
|
||||
args.sendbytes = 0;
|
||||
NCCLCHECK(SaveProxy(proxyRecv, peerrecv, &args));
|
||||
}
|
||||
if (info->delta > 0 && info->sendbytes >= 0) {
|
||||
int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
|
||||
args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
|
||||
if (args.nsteps == 0) args.nsteps = 1;
|
||||
args.sendbytes = info->sendbytes;
|
||||
args.recvbytes = 0;
|
||||
NCCLCHECK(SaveProxy(proxySend, peersend, &args));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info) {
|
||||
const int peersPerChan = DIVUP(info->comm->nRanks, info->nChannels);
|
||||
const int chunkSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS*info->chunkSteps;
|
||||
const int loopSize = (info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1)*info->nchunksPerLoop*chunkSize;
|
||||
for (int p=0; p<peersPerChan; p++) {
|
||||
if ((peersPerChan == 1 && args->channel->id >= (info->nChannels/info->comm->nRanks)*info->comm->nRanks) ||
|
||||
(peersPerChan > 1 && args->channel->id*peersPerChan+p >= info->comm->nRanks))
|
||||
continue;
|
||||
// first channel is reserved for self copy
|
||||
if ((args->channel->id*peersPerChan+p)%info->comm->nRanks == 0)
|
||||
continue;
|
||||
int peerSend = (info->comm->rank+(args->channel->id*peersPerChan)+p)%info->comm->nRanks;
|
||||
int peerRecv = (2*info->comm->nRanks+info->comm->rank-(args->channel->id*peersPerChan)%info->comm->nRanks-p%info->comm->nRanks)%info->comm->nRanks;
|
||||
if (info->coll == ncclCollAllToAll || (info->coll == ncclCollScatter && info->comm->rank == info->root) ||
|
||||
(info->coll == ncclCollGather && peerSend == info->root))
|
||||
NCCLCHECK(SaveProxy<proxySend>(peerSend, args));
|
||||
if (info->coll == ncclCollAllToAll || (info->coll == ncclCollGather && info->comm->rank == info->root) ||
|
||||
(info->coll == ncclCollScatter && peerRecv == info->root))
|
||||
NCCLCHECK(SaveProxy<proxyRecv>(peerRecv, args));
|
||||
if (info->coll == ncclCollAllToAllv) {
|
||||
info->nBytes = info->sendcounts[peerSend]*info->count;
|
||||
int nLoops = (int)(DIVUP(info->nBytes, loopSize));
|
||||
args->nsteps = info->nstepsPerLoop*nLoops*info->chunkSteps;
|
||||
TRACE(NCCL_NET,"peerSend %d opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
peerSend, args->opCount, args->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
||||
nLoops, args->nsteps, info->comm);
|
||||
NCCLCHECK(SaveProxy<proxySend>(peerSend, args));
|
||||
info->nBytes = info->recvcounts[peerRecv]*info->count;
|
||||
nLoops = (int)(DIVUP(info->nBytes, loopSize));
|
||||
args->nsteps = info->nstepsPerLoop*nLoops*info->chunkSteps;
|
||||
TRACE(NCCL_NET,"peerRecv %d opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
peerRecv, args->opCount, args->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
||||
nLoops, args->nsteps, info->comm);
|
||||
NCCLCHECK(SaveProxy<proxyRecv>(peerRecv, args));
|
||||
static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr, struct ncclProxyArgs** prevGroupPtr) {
|
||||
struct ncclProxyArgs* freeOp = *opPtr;
|
||||
DEBUG_PROXY_PRINT("Remove %ld/%ld -> %ld -> %ld/%ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(*prevGroupPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next), OP_INDEX(freeOp->nextGroup));
|
||||
if (*prevGroupPtr && *prevOpPtr) return ncclInternalError;
|
||||
if (freeOp->nextGroup) {
|
||||
// Part of a group : remove the element
|
||||
struct ncclProxyArgs* next = freeOp->nextGroup;
|
||||
*opPtr = next;
|
||||
if (*prevGroupPtr) {
|
||||
(*prevGroupPtr)->nextGroup = next;
|
||||
} else if (*prevOpPtr) {
|
||||
(*prevOpPtr)->next = next;
|
||||
} else {
|
||||
state->ops = next;
|
||||
}
|
||||
} else {
|
||||
struct ncclProxyArgs* next = freeOp->next;
|
||||
*opPtr = next;
|
||||
if ((*prevGroupPtr)) {
|
||||
(*prevGroupPtr)->next = next;
|
||||
(*prevGroupPtr)->nextGroup = NULL;
|
||||
(*prevGroupPtr)->nextPeer = freeOp->nextPeer;
|
||||
if (*(freeOp->proxyAppendPtr) == freeOp) *(freeOp->proxyAppendPtr) = *prevGroupPtr;
|
||||
(*prevOpPtr) = *prevGroupPtr;
|
||||
(*prevGroupPtr) = NULL;
|
||||
} else {
|
||||
if (freeOp->nextPeer) {
|
||||
// replace op by nextPeer
|
||||
struct ncclProxyArgs* nextPeer = freeOp->nextPeer;
|
||||
if (*prevOpPtr) {
|
||||
(*prevOpPtr)->next = nextPeer;
|
||||
} else {
|
||||
state->ops = nextPeer;
|
||||
}
|
||||
struct ncclProxyArgs* lastGroup = nextPeer;
|
||||
while (lastGroup->nextGroup) lastGroup = lastGroup->nextGroup;
|
||||
lastGroup->next = next;
|
||||
*(prevOpPtr) = lastGroup;
|
||||
} else {
|
||||
*(freeOp->proxyAppendPtr) = NULL;
|
||||
if (*prevOpPtr) {
|
||||
(*prevOpPtr)->next = next;
|
||||
} else {
|
||||
state->ops = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pthread_mutex_lock(&state->poolMutex);
|
||||
freeOp->next = state->pool;
|
||||
state->pool = freeOp;
|
||||
pthread_mutex_unlock(&state->poolMutex);
|
||||
DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
|
||||
NCCLCHECK(dumpProxyState(state));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
|
||||
struct ncclProxyArgs* prevOp = NULL;
|
||||
struct ncclProxyArgs* prevGroup = NULL;
|
||||
struct ncclProxyArgs* op = *opsPtr;
|
||||
while (op) {
|
||||
if (op->state == ncclProxyOpNone) return ncclInternalError;
|
||||
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
|
||||
// yet and might be cancelled before they even start. Hold on on those.
|
||||
if (op->opCount < comm->lastOpCount) {
|
||||
NCCLCHECK(op->progress(op));
|
||||
*idle &= op->idle;
|
||||
}
|
||||
if (op->state == ncclProxyOpNone) {
|
||||
NCCLCHECK(removeOp(state, &op, &prevOp, &prevGroup));
|
||||
} else {
|
||||
if (op->nextGroup) {
|
||||
prevGroup = op;
|
||||
prevOp = NULL;
|
||||
op = op->nextGroup;
|
||||
} else {
|
||||
prevOp = op;
|
||||
prevGroup = NULL;
|
||||
op = op->next;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -207,91 +340,170 @@ ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info)
|
||||
void* persistentThread(void *comm_) {
|
||||
struct ncclComm* comm = (struct ncclComm*)comm_;
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
struct ncclProxyArgs* op = NULL;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int idle = 1;
|
||||
int idleSpin = 0;
|
||||
char threadName[16];
|
||||
sprintf(threadName, "NCCLproxy %5d", comm->rank);
|
||||
nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
|
||||
|
||||
pthread_mutex_lock(&state->opsMutex);
|
||||
struct ncclProxyArgs** opsPtr = &state->ops;
|
||||
while (1) {
|
||||
do {
|
||||
if (*comm->abortFlag) return NULL;
|
||||
if (op == NULL) {
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
op = state->ops;
|
||||
if (op == NULL) {
|
||||
if (state->stop) {
|
||||
// No more commands to process and proxy has been requested to stop
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
return NULL;
|
||||
}
|
||||
pthread_cond_wait(&state->cond, &state->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
if (LOAD(comm->abortFlag)) {
|
||||
pthread_mutex_unlock(&state->opsMutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
while (LOAD(opsPtr) == NULL) {
|
||||
if (state->stop) {
|
||||
// No more commands to process and proxy has been requested to stop
|
||||
pthread_mutex_unlock(&state->opsMutex);
|
||||
return NULL;
|
||||
}
|
||||
} while (op == NULL);
|
||||
op->idle = 0;
|
||||
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
|
||||
// yet and might be cancelled before they even start. Hold on on those.
|
||||
if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
|
||||
pthread_cond_wait(&state->cond, &state->opsMutex);
|
||||
}
|
||||
int idle = 1;
|
||||
ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
|
||||
if (ret != ncclSuccess) {
|
||||
comm->fatalError = ret;
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
|
||||
pthread_mutex_unlock(&state->opsMutex);
|
||||
return NULL;
|
||||
}
|
||||
idle &= op->idle;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
if (!idle) idleSpin = 0;
|
||||
struct ncclProxyArgs *next = op->next;
|
||||
if (next->state == ncclProxyOpNone) {
|
||||
struct ncclProxyArgs *freeOp = next;
|
||||
if (next->nextPeer) {
|
||||
// Replace next by its next per-peer element.
|
||||
next = next->nextPeer;
|
||||
if (op != freeOp) {
|
||||
next->next = freeOp->next;
|
||||
op->next = next;
|
||||
} else {
|
||||
next->next = next;
|
||||
}
|
||||
} else {
|
||||
// Remove next from circular list
|
||||
next->connector->proxyAppend = NULL;
|
||||
if (op != freeOp) {
|
||||
next = next->next;
|
||||
op->next = next;
|
||||
} else {
|
||||
next = NULL;
|
||||
}
|
||||
}
|
||||
if (freeOp == state->ops) state->ops = next;
|
||||
freeOp->next = state->pool;
|
||||
state->pool = freeOp;
|
||||
if (idle) {
|
||||
pthread_mutex_unlock(&state->opsMutex);
|
||||
sched_yield(); // No request progressed. Let others run.
|
||||
pthread_mutex_lock(&state->opsMutex);
|
||||
}
|
||||
op = next;
|
||||
if (op == state->ops) {
|
||||
if (idle == 1) {
|
||||
if (++idleSpin == 10) {
|
||||
sched_yield();
|
||||
idleSpin = 0;
|
||||
}
|
||||
}
|
||||
idle = 1;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
}
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm) {
|
||||
pthread_mutex_lock(&comm->proxyState.mutex);
|
||||
if (comm->proxyState.ops != NULL)
|
||||
pthread_cond_signal(&comm->proxyState.cond);
|
||||
pthread_mutex_unlock(&comm->proxyState.mutex);
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
pthread_mutex_lock(&state->opsMutex);
|
||||
|
||||
// Sort operations as we append them : collectives and
|
||||
// receives first, then sends.
|
||||
ncclProxyArgs* next, *prev = NULL, *op = state->nextOps;
|
||||
while (op) {
|
||||
next = op->next;
|
||||
if (op->sendbytes) {
|
||||
if (prev) prev->next = next;
|
||||
else state->nextOps = next;
|
||||
op->next = NULL;
|
||||
NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
|
||||
} else prev = op;
|
||||
op = next;
|
||||
}
|
||||
op = state->nextOps;
|
||||
while (op) {
|
||||
next = op->next;
|
||||
op->next = NULL;
|
||||
NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
|
||||
op = next;
|
||||
}
|
||||
state->nextOps = state->nextOpsEnd = NULL;
|
||||
NCCLCHECK(dumpProxyState(state));
|
||||
|
||||
if (state->ops != NULL)
|
||||
pthread_cond_signal(&state->cond);
|
||||
pthread_mutex_unlock(&state->opsMutex);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(ProxySharedBuffersCount, "SHARED_BUFF_COUNT", -2);
|
||||
|
||||
// Lazily set up the communicator-wide shared proxy buffer pools and return the
// base of the requested one.
//   comm : communicator owning proxyState.sharedBuffs (created here on first call)
//   cuda : non-zero selects the pool allocated with ncclCudaCalloc, zero the pool
//          allocated with ncclCudaHostCalloc
//   size : [out] total byte size of one pool set (2 * p2pnChannels * slotSize * nslots)
//   ptr  : [out] base address of the selected pool (sub-pool 0)
// Returns ncclSuccess, or the error propagated by NCCLCHECK from an allocation.
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
  if (state == NULL) {
    NCCLCHECK(ncclCalloc(&state, 1));
    comm->proxyState.sharedBuffs = state;
    state->nslots = ncclParamProxySharedBuffersCount();
    if (state->nslots == -2) {
      // -2 is the default of the SHARED_BUFF_COUNT parameter: use the built-in count.
      state->nslots = NCCL_STEPS*NCCL_MAX_WORK_ELEMENTS;
    }
    state->slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
  }

  // One sub-pool per (channel, send/recv) pair.
  const int npools = 2*comm->p2pnChannels;
  *size = npools*state->slotSize*state->nslots;

  if (cuda) {
    // Only the requested memory type is allocated: an already-initialized
    // device pool must not fall through and trigger a host allocation.
    if (state->cudaBuff[0] == NULL) {
      char* buff;
      int* used;
      NCCLCHECK(ncclCudaCalloc(&buff, *size, cuda));
      NCCLCHECK(ncclCalloc(&used, npools*state->nslots));
      for (int i=0; i<npools; i++) {
        state->cudaBuff[i] = buff + state->nslots*state->slotSize*i;
        state->cudaUsed[i] = used + state->nslots*i;
      }
    }
  } else if (state->hostBuff[0] == NULL) {
    char* buff;
    int* used;
    NCCLCHECK(ncclCudaHostCalloc(&buff, *size));
    NCCLCHECK(ncclCalloc(&used, npools*state->nslots));
    for (int i=0; i<npools; i++) {
      state->hostBuff[i] = buff + state->nslots*state->slotSize*i;
      state->hostUsed[i] = used + state->nslots*i;
    }
  }

  *ptr = cuda ? state->cudaBuff[0] : state->hostBuff[0];
  return ncclSuccess;
}
|
||||
|
||||
// Reserve a run of contiguous slots from a shared proxy buffer pool.
//   comm    : communicator whose proxyState.sharedBuffs holds the pools
//   cuda    : non-zero selects the cudaBuff/cudaUsed pool, zero the hostBuff/hostUsed pool
//   type    : combined with channel as 2*channel+type to pick the sub-pool
//   channel : channel index
//   size    : requested size in bytes; rounded up to a power-of-two number of slots
//   ptr     : [out] start of the reserved region, or NULL when no window is free
// Returns ncclInternalError if the selected pool was never initialized, and
// ncclSuccess otherwise (including when *ptr is set to NULL).
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr) {
  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
  // Use different pools for different channels and also separate send/recv.
  int p = 2*channel+type;
  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
  if (buff == NULL) return ncclInternalError;

  // Smallest power-of-two slot count that covers the request.
  int nslots = 1;
  while (nslots*state->slotSize < size) nslots *= 2;

  // Scan windows aligned on the rounded slot count. The bound keeps every
  // window fully inside the pool, so used[] is never indexed past
  // state->nslots when the request is larger than the pool or the pool size
  // is not a multiple of the window.
  for (int s=0; s+nslots<=state->nslots; s+=nslots) {
    int busy = 0;
    for (int i=0; i<nslots && !busy; i++) busy = used[s+i];
    if (!busy) {
      for (int i=0; i<nslots; i++) used[s+i] = 1;
      *ptr = buff+state->slotSize*s;
      return ncclSuccess;
    }
  }
  // Nothing available right now; the caller sees a NULL pointer with ncclSuccess.
  *ptr = NULL;
  return ncclSuccess;
}
|
||||
|
||||
// Release a region previously reserved from a shared proxy buffer pool.
//   comm, cuda, type, channel : select the sub-pool, as 2*channel+type within
//                               the device (cuda != 0) or host pool
//   size : byte size of the region; rounded up to a power-of-two slot count
//   ptr  : start of the region inside the selected sub-pool
// Returns ncclInternalError if the sub-pool is uninitialized or the region
// does not lie inside it; ncclSuccess once the slots are marked free.
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr) {
  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
  int p = 2*channel+type;
  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
  if (buff == NULL) return ncclInternalError;

  // Same rounding as the allocation side so the same slots are cleared.
  int nslots = 1;
  while (nslots*state->slotSize < size) nslots *= 2;

  // A pointer below the pool base must be rejected before dividing: integer
  // division truncates toward zero, so a small negative offset would
  // otherwise map to slot 0 and pass the range check.
  int s = (ptr < buff) ? -1 : (int)((ptr-buff)/state->slotSize);
  if (s < 0 || s+nslots > state->nslots) {
    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
    return ncclInternalError;
  }
  for (int i=0; i<nslots; i++) used[s+i] = 0;
  return ncclSuccess;
}
|
||||
|
||||
// Release the shared proxy buffer pools of a communicator, if any were created.
// Frees the device pool, the host pool, both usage maps and the bookkeeping
// struct, then clears the communicator's pointer to it.
// Returns ncclSuccess, or the error raised by CUDACHECK/NCCLCHECK on a failed free.
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
  if (state == NULL) return ncclSuccess;

  // Entry 0 of each array is the base of the single allocation backing all sub-pools.
  CUDACHECK(hipFree(state->cudaBuff[0]));
  free(state->cudaUsed[0]);
  NCCLCHECK(ncclCudaHostFree(state->hostBuff[0]));
  free(state->hostUsed[0]);
  free(state);
  // Do not leave a dangling pointer behind: a repeated destroy becomes a no-op
  // instead of a double free.
  comm->proxyState.sharedBuffs = NULL;
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
|
||||
if (!comm->proxyThread) {
|
||||
comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
|
||||
comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
comm->proxyState.ops = NULL;
|
||||
pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
|
||||
}
|
||||
@@ -302,21 +514,23 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
|
||||
// Request the proxy to stop and then wake it
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
pthread_mutex_lock(&state->opsMutex);
|
||||
state->stop = true;
|
||||
pthread_cond_signal(&state->cond);
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
pthread_mutex_unlock(&state->opsMutex);
|
||||
if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
|
||||
|
||||
// Free off any memory allocated for the proxy arg pools
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
pthread_mutex_lock(&state->poolMutex);
|
||||
struct ncclProxyState* proxyState = &comm->proxyState;
|
||||
while (proxyState->pools != NULL) {
|
||||
struct ncclProxyPool *next = proxyState->pools->next;
|
||||
free(proxyState->pools);
|
||||
proxyState->pools = next;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
pthread_mutex_unlock(&state->poolMutex);
|
||||
|
||||
NCCLCHECK(ncclProxySharedBuffersDestroy(comm));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -20,15 +20,15 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
||||
};
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
|
||||
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
|
||||
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
|
||||
if (ret) {
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId));
|
||||
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId));
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
@@ -36,51 +36,86 @@ static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopo
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
|
||||
uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
|
||||
struct ncclConnect connect;
|
||||
struct ncclConnector* conn;
|
||||
uint32_t mask = 1 << channel->id;
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].recv;
|
||||
if (conn->connected) { ++nSkippedRecv; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
|
||||
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue;
|
||||
comm->connectRecv[peer] |= mask;
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].send;
|
||||
if (conn->connected) { ++nSkippedSend; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
|
||||
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue;
|
||||
comm->connectSend[peer] |= mask;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void dumpData(struct ncclConnect* data, int ndata) {
|
||||
for (int n=0; n<ndata; n++) {
|
||||
printf("[%d] ", n);
|
||||
uint8_t* d = (uint8_t*)data;
|
||||
for (int i=0; i<sizeof(struct ncclConnect); i++) printf("%02x", d[i]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
|
||||
struct ncclConnect data[2*MAXCHANNELS];
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
uint32_t recvMask = comm->connectRecv[recvPeer];
|
||||
uint32_t sendMask = comm->connectSend[sendPeer];
|
||||
|
||||
struct ncclConnect* recvData = data;
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (recvMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
|
||||
NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
|
||||
}
|
||||
}
|
||||
struct ncclConnect* sendData = recvData+recvChannels;
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (sendMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
|
||||
NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
|
||||
}
|
||||
}
|
||||
|
||||
if (sendPeer == recvPeer) {
|
||||
if (recvChannels+sendChannels) {
|
||||
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
|
||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
|
||||
sendData = data;
|
||||
recvData = data+sendChannels;
|
||||
}
|
||||
} else {
|
||||
if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
|
||||
if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
|
||||
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
|
||||
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
|
||||
}
|
||||
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (sendMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (recvMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].send;
|
||||
if (conn->connected) {++nSkippedSend; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
}
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].recv;
|
||||
if (conn->connected) {++nSkippedRecv; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
}
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -27,10 +27,8 @@ struct reqSlot {
|
||||
|
||||
struct collNetSendResources {
|
||||
void* collNetSendComm;
|
||||
struct ncclSendMem* hostSendMem;
|
||||
struct ncclRecvMem* hostRecvMem;
|
||||
struct ncclSendMem* devHostSendMem;
|
||||
struct ncclRecvMem* devHostRecvMem;
|
||||
struct ncclSendMem* sendMem;
|
||||
struct ncclRecvMem* recvMem;
|
||||
uint32_t* llData;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
@@ -46,10 +44,8 @@ struct collNetSendResources {
|
||||
struct collNetRecvResources {
|
||||
void* netListenComm;
|
||||
void* collNetRecvComm;
|
||||
struct ncclSendMem* hostSendMem;
|
||||
struct ncclRecvMem* hostRecvMem;
|
||||
struct ncclSendMem* devHostSendMem;
|
||||
struct ncclRecvMem* devHostRecvMem;
|
||||
struct ncclSendMem* sendMem;
|
||||
struct ncclRecvMem* recvMem;
|
||||
uint32_t* llData;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
@@ -68,16 +64,15 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
|
||||
}
|
||||
|
||||
/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
|
||||
ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
struct collNetSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
|
||||
resources->devHostSendMem = resources->hostSendMem;
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
|
||||
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p];
|
||||
@@ -85,8 +80,7 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
|
||||
if (resources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
|
||||
resources->devHostRecvMem = resources->hostRecvMem;
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
|
||||
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
|
||||
@@ -95,16 +89,15 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
|
||||
}
|
||||
|
||||
/* Setup recv connector */
|
||||
ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
struct collNetRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
|
||||
resources->devHostSendMem = resources->hostSendMem;
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
|
||||
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p];
|
||||
@@ -112,8 +105,7 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
|
||||
if (resources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
|
||||
resources->devHostRecvMem = resources->hostRecvMem;
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
|
||||
|
||||
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2));
|
||||
|
||||
@@ -124,25 +116,25 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
|
||||
ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
|
||||
// Setup device pointers
|
||||
struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
|
||||
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
|
||||
|
||||
// Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
|
||||
send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
|
||||
offset += send->comm->buffSizes[p];
|
||||
}
|
||||
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
// Head/Tail/Opcount/Fifos are always on host
|
||||
send->conn.tail = &resources->devHostRecvMem->tail;
|
||||
send->conn.fifo = resources->devHostRecvMem->sizesFifo;
|
||||
send->conn.head = &resources->devHostSendMem->head;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
|
||||
send->conn.tail = &resources->recvMem->tail;
|
||||
send->conn.sizesFifo = resources->recvMem->sizesFifo;
|
||||
send->conn.head = &resources->sendMem->head;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
|
||||
|
||||
// Get info from recv side
|
||||
resources->collNetRank = rank;
|
||||
@@ -160,24 +152,24 @@ ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, in
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
|
||||
ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
|
||||
// Setup device pointers
|
||||
struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
|
||||
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
|
||||
resources->collNetRank = rank;
|
||||
|
||||
// Intermediate buffering on GPU for GPU Direct RDMA
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
|
||||
recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
|
||||
offset += recv->comm->buffSizes[p];
|
||||
}
|
||||
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
// Head/Tail/Opcount are always on host
|
||||
recv->conn.tail = &resources->devHostRecvMem->tail;
|
||||
recv->conn.head = &resources->devHostSendMem->head;
|
||||
recv->conn.tail = &resources->recvMem->tail;
|
||||
recv->conn.head = &resources->sendMem->head;
|
||||
|
||||
// Connect to coll comm
|
||||
collNetHandle_t** handlePtrs = NULL;
|
||||
@@ -214,8 +206,8 @@ cleanup:
|
||||
|
||||
ncclResult_t collNetSendFree(void* sendTransportResources) {
|
||||
struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
|
||||
if (resources->collNetSendComm) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL]));
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
@@ -229,12 +221,12 @@ ncclResult_t collNetSendFree(void* sendTransportResources) {
|
||||
|
||||
ncclResult_t collNetRecvFree(void* recvTransportResources) {
|
||||
struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
|
||||
if (resources->collNetRecvComm) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL]));
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
|
||||
if (resources->useGdr)
|
||||
CUDACHECK(hipFree(resources->devRecvMem));
|
||||
free(resources->llData);
|
||||
@@ -257,96 +249,84 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Round to next multiple of sliceSteps
|
||||
resources->step = ROUNDUP(resources->step, args->chunkSteps);
|
||||
args->head = resources->step;
|
||||
args->tail = resources->step;
|
||||
args->end = args->head + args->nsteps;
|
||||
args->posted = args->transmitted = args->done = resources->step;
|
||||
args->end = resources->step + args->nsteps;
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
args->idle = 1;
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* sendMhandle = resources->sendMhandles[p];
|
||||
void* recvMhandle = resources->recvMhandles[p];
|
||||
args->idle = 1;
|
||||
struct reqSlot* reqFifo = resources->reqFifo;
|
||||
if (args->head < args->end) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS
|
||||
&& reqFifo[buffSlot].recvBuff != NULL) {
|
||||
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
|
||||
int buffSlot = args->transmitted%NCCL_STEPS;
|
||||
if (args->transmitted < args->end && args->transmitted < args->done + NCCL_STEPS
|
||||
&& LOAD(&reqFifo[buffSlot].recvBuff) != NULL) {
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
|
||||
// We have something to receive, let's check if it's completely ready.
|
||||
int size = LOAD(sizesFifo+buffSlot);
|
||||
char* buff = localBuff+buffSlot*stepSize;
|
||||
int ready = 1;
|
||||
if (args->protocol == NCCL_PROTO_LL) {
|
||||
int size = LOAD(sizesFifo+buffSlot);
|
||||
if (size != -1) {
|
||||
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
|
||||
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
|
||||
int ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *f1 = &lines[i].flag1;
|
||||
volatile uint32_t *f2 = &lines[i].flag2;
|
||||
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
|
||||
}
|
||||
if (ready) {
|
||||
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
|
||||
//separate data from flag
|
||||
uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *d1 = &lines[i].data1;
|
||||
volatile uint32_t *d2 = &lines[i].data2;
|
||||
sendBuff[2*i] = LOAD(d1);
|
||||
sendBuff[2*i+1] = LOAD(d2);
|
||||
}
|
||||
int count = nFifoLines*2*sizeof(uint32_t) / ncclTypeSize(args->dtype);
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (args->tail < LOAD(recvTail)) {
|
||||
// Send through network
|
||||
if (LOAD(sizesFifo+buffSlot) != -1) {
|
||||
int count = LOAD(sizesFifo+buffSlot)/ncclTypeSize(args->dtype);
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localBuff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
|
||||
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
|
||||
// Pack data into another buffer
|
||||
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
|
||||
uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements
|
||||
buff = (char*)sendBuff;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *f1 = &lines[i].flag1;
|
||||
volatile uint32_t *d1 = &lines[i].data1;
|
||||
volatile uint32_t *f2 = &lines[i].flag2;
|
||||
volatile uint32_t *d2 = &lines[i].data2;
|
||||
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
|
||||
sendBuff[2*i] = LOAD(d1);
|
||||
sendBuff[2*i+1] = LOAD(d2);
|
||||
}
|
||||
size = nFifoLines*2*sizeof(uint32_t);
|
||||
}
|
||||
}
|
||||
if (args->head < args->tail) {
|
||||
int done, size;
|
||||
int buffSlot = args->head%NCCL_STEPS;
|
||||
NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size);
|
||||
reqFifo[buffSlot].size = size;
|
||||
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
|
||||
// (reordered store after store is possible on POWER, though not on x86)
|
||||
__sync_synchronize();
|
||||
reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
|
||||
args->head += args->sliceSteps;
|
||||
STORE(&resources->hostSendMem->head, args->head);
|
||||
args->idle = 0;
|
||||
if (ready) {
|
||||
// Data is ready, try to send.
|
||||
int count = size/ncclTypeSize(args->dtype);
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->transmitted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (args->head == args->end) {
|
||||
resources->step = args->end;
|
||||
args->idle = 0;
|
||||
args->state = ncclProxyOpNone;
|
||||
// Check whether the network has completed some send operations.
|
||||
if (args->done < args->transmitted) {
|
||||
int done, size;
|
||||
int buffSlot = args->done%NCCL_STEPS;
|
||||
NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
|
||||
STORE(&reqFifo[buffSlot].size, size);
|
||||
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
|
||||
// (reordered store after store is possible on POWER, though not on x86)
|
||||
__sync_synchronize();
|
||||
STORE(&reqFifo[buffSlot].recvBuff, NULL); // Notify recvProxy
|
||||
args->done += args->sliceSteps;
|
||||
resources->sendMem->head = args->done;
|
||||
args->idle = 0;
|
||||
if (args->done == args->end) {
|
||||
resources->step = args->end;
|
||||
args->state = ncclProxyOpNone;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -361,56 +341,79 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Round to next multiple of sliceSteps
|
||||
resources->step = ROUNDUP(resources->step, args->chunkSteps);
|
||||
args->head = resources->step;
|
||||
args->tail = resources->step;
|
||||
args->end = args->head + args->nsteps;
|
||||
args->posted = args->received = args->transmitted = args->done = resources->step;
|
||||
args->end = resources->step + args->nsteps;
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
args->idle = 1;
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
args->idle = 1;
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* mhandle = resources->mhandles[p];
|
||||
struct reqSlot* reqFifo = resources->reqFifo;
|
||||
if (args->head < args->end) {
|
||||
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
|
||||
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
|
||||
reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize;
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, reqFifo[buffSlot].recvBuff);
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
if (args->tail > args->head) {
|
||||
int buffSlot = args->head%NCCL_STEPS;
|
||||
if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size);
|
||||
args->head += args->sliceSteps;
|
||||
if (args->protocol == NCCL_PROTO_LL) { // ll
|
||||
// re-attach flag
|
||||
uint32_t flag = args->head;
|
||||
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
|
||||
uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
|
||||
int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t));
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
|
||||
lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
|
||||
}
|
||||
} else if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (resources->useGdr) NCCLCHECK(collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle));
|
||||
resources->hostRecvMem->tail = args->head;
|
||||
if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
|
||||
int buffSlot = args->posted%NCCL_STEPS;
|
||||
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
|
||||
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
|
||||
STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize);
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
|
||||
args->posted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (args->posted > args->received) {
|
||||
int buffSlot = args->received%NCCL_STEPS;
|
||||
if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
|
||||
if (args->protocol == NCCL_PROTO_LL) { // ll
|
||||
// re-attach flag
|
||||
uint32_t flag = NCCL_LL_FLAG(args->received + 1);
|
||||
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
|
||||
uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
|
||||
int nFifoLines = DIVUP(LOAD(&reqFifo[buffSlot].size), 2*sizeof(uint32_t));
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
|
||||
lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
|
||||
}
|
||||
args->idle = 0;
|
||||
}
|
||||
args->received += args->sliceSteps;
|
||||
if (LOAD(&reqFifo[buffSlot].size) > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
|
||||
NCCLCHECK(collNetIflush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, LOAD(&reqFifo[buffSlot].size), mhandle, args->requests+buffSlot));
|
||||
} else {
|
||||
args->requests[buffSlot] = NULL;
|
||||
}
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
if (args->head == args->end) {
|
||||
resources->step = args->end;
|
||||
args->idle = 0;
|
||||
args->state = ncclProxyOpNone;
|
||||
if (args->received > args->transmitted) {
|
||||
// Progress flush operations
|
||||
int buffSlot = args->transmitted%NCCL_STEPS;
|
||||
int done = 1;
|
||||
if (args->requests[buffSlot]) NCCLCHECK(collNetTest(args->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
args->transmitted += args->sliceSteps;
|
||||
__sync_synchronize();
|
||||
resources->recvMem->tail = args->transmitted;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
if (args->transmitted > args->done) {
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
uint64_t done = LOAD(sendHead);
|
||||
while (done > args->done &&
|
||||
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
|
||||
args->transmitted > args->done) {
|
||||
args->done += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
if (args->done == args->end) {
|
||||
resources->step = args->end;
|
||||
args->state = ncclProxyOpNone;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "net.h"
|
||||
#include "graph.h"
|
||||
#include <sys/time.h>
|
||||
#include <numaif.h>
|
||||
#include "collectives.h"
|
||||
|
||||
struct netConnectInfo {
|
||||
ncclNetHandle_t netHandle;
|
||||
@@ -25,6 +25,7 @@ struct netSendResources {
|
||||
struct ncclRecvMem* recvMem;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int shared;
|
||||
char* buffers[LOC_COUNT];
|
||||
int buffSizes[LOC_COUNT];
|
||||
void* mhandles[LOC_COUNT];
|
||||
@@ -40,6 +41,7 @@ struct netRecvResources {
|
||||
struct ncclRecvMem* recvMem;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int shared;
|
||||
char* buffers[LOC_COUNT];
|
||||
int buffSizes[LOC_COUNT];
|
||||
void* mhandles[LOC_COUNT];
|
||||
@@ -55,118 +57,118 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
|
||||
|
||||
/* Determine if we will use this transport for this peer and return connect
|
||||
* information for this peer */
|
||||
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
struct netSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
|
||||
|
||||
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
send->conn.tail = &resources->recvMem->tail;
|
||||
send->conn.fifo = resources->recvMem->sizesFifo;
|
||||
send->conn.sizesFifo = resources->recvMem->sizesFifo;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
|
||||
send->conn.head = &resources->sendMem->head;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
|
||||
resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
|
||||
|
||||
int protoLoc[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
if (resources->shared == 0) {
|
||||
int protoLoc[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
}
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
buffSizes[p] = send->comm->buffSizes[p];
|
||||
resources->buffSizes[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
|
||||
}
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
|
||||
}
|
||||
|
||||
int offsets[LOC_COUNT];
|
||||
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
|
||||
send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
|
||||
offsets[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
}
|
||||
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
// Only allocate buffers for simple for p2p connections
|
||||
buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : send->comm->buffSizes[p];
|
||||
resources->buffSizes[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
|
||||
}
|
||||
char line[16];
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
|
||||
int status[1] = {-1};
|
||||
line[0]= 0;
|
||||
if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
|
||||
sprintf(line, "/MEM%d", status[0]);
|
||||
}
|
||||
|
||||
int offsets[LOC_COUNT];
|
||||
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
|
||||
send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
|
||||
offsets[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : line);
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
struct netRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
|
||||
|
||||
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
recv->conn.tail = &resources->recvMem->tail;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
|
||||
recv->conn.head = &resources->sendMem->head;
|
||||
|
||||
int protoLoc[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p
|
||||
int protoLoc[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
}
|
||||
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
buffSizes[p] = recv->comm->buffSizes[p];
|
||||
resources->buffSizes[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
|
||||
}
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
|
||||
}
|
||||
|
||||
int offsets[LOC_COUNT];
|
||||
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
|
||||
recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
|
||||
offsets[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
}
|
||||
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
// Only allocate buffers for simple for p2p connections
|
||||
buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : recv->comm->buffSizes[p];
|
||||
resources->buffSizes[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
|
||||
}
|
||||
char line[16];
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
|
||||
int status[1] = {-1};
|
||||
line[0]= 0;
|
||||
if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
|
||||
sprintf(line, "/MEM%d", status[0]);
|
||||
}
|
||||
|
||||
int offsets[LOC_COUNT];
|
||||
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
|
||||
recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
|
||||
offsets[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : line);
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
|
||||
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
// Setup device pointers
|
||||
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
|
||||
struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
|
||||
@@ -174,6 +176,13 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
// Connect to remote peer
|
||||
NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
|
||||
|
||||
if (resources->shared) {
|
||||
// Get shared buffers
|
||||
int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
|
||||
resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
|
||||
}
|
||||
@@ -184,7 +193,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
}
|
||||
|
||||
/* Connect to this peer */
|
||||
ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
// Setup device pointers
|
||||
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
|
||||
|
||||
@@ -192,6 +201,13 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
|
||||
|
||||
if (resources->shared) {
|
||||
// Get shared buffers
|
||||
int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
|
||||
resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
|
||||
}
|
||||
@@ -209,8 +225,10 @@ ncclResult_t netSendFree(void* transportResources) {
|
||||
if (resources->buffers[l])
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
|
||||
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
|
||||
if (resources->shared == 0) {
|
||||
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
|
||||
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
|
||||
}
|
||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
@@ -224,138 +242,144 @@ ncclResult_t netRecvFree(void* transportResources) {
|
||||
if (resources->buffers[l])
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
|
||||
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
|
||||
if (resources->shared == 0) {
|
||||
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
|
||||
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
|
||||
}
|
||||
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
|
||||
|
||||
ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Round to next multiple of sliceSteps
|
||||
resources->step = ROUNDUP(resources->step, args->chunkSteps);
|
||||
args->head = resources->step;
|
||||
args->tail = resources->step;
|
||||
args->end = args->head + args->nsteps;
|
||||
args->posted = args->transmitted = args->done = resources->step;
|
||||
args->end = resources->step + args->nsteps;
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
args->idle = 1;
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* mhandle = *(resources->mhandlesProto[p]);
|
||||
args->idle = 1;
|
||||
if (args->head < args->end) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
int buffSize = stepSize*args->sliceSteps;
|
||||
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
|
||||
if (args->sendbytes < buffSize) buffSize = args->sendbytes;
|
||||
// Post buffers to the GPU
|
||||
if (args->posted < args->end && args->posted < args->done + NCCL_STEPS) {
|
||||
if (resources->shared) {
|
||||
char* ptr;
|
||||
NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, &ptr));
|
||||
if (ptr == NULL) return ncclInternalError;
|
||||
resources->recvMem->ptrsFifo[args->posted%NCCL_STEPS] = ptr;
|
||||
__sync_synchronize();
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
args->posted += args->sliceSteps;
|
||||
STORE(sendHead, args->posted - NCCL_STEPS);
|
||||
} else args->posted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Check whether we received data from the GPU and send it to the network
|
||||
int buffSlot = args->transmitted%NCCL_STEPS;
|
||||
if (args->transmitted < args->posted && args->transmitted < args->done + NCCL_STEPS) {
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
|
||||
// We have something to receive, let's check if it's completely ready.
|
||||
int size = LOAD(sizesFifo+buffSlot);
|
||||
char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
int ready = 1;
|
||||
if (args->protocol == NCCL_PROTO_LL128) {
|
||||
if (args->tail < LOAD(recvTail)) {
|
||||
if (LOAD(sizesFifo+buffSlot) != -1) {
|
||||
int ready = resources->useGdr;
|
||||
if (!ready) {
|
||||
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
|
||||
// called threadfence()
|
||||
uint64_t flag = args->tail + 1;
|
||||
int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
|
||||
volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
|
||||
ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
if (LOAD(lines+i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS) != flag) { ready = 0; break; }
|
||||
}
|
||||
}
|
||||
if (ready) {
|
||||
// Send through network
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
int ready = resources->useGdr;
|
||||
if (!ready) {
|
||||
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
|
||||
// called threadfence()
|
||||
uint64_t flag = args->transmitted + 1;
|
||||
int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
|
||||
volatile uint64_t* lines = (volatile uint64_t*)buff;
|
||||
ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
if (LOAD(lines+i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS) != flag) { ready = 0; break; }
|
||||
}
|
||||
}
|
||||
} else if (args->protocol == NCCL_PROTO_LL) {
|
||||
int size = LOAD(sizesFifo+buffSlot);
|
||||
if (size != -1) {
|
||||
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
|
||||
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
||||
size = nFifoLines * sizeof(union ncclLLFifoLine);
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
|
||||
int ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *f1 = &lines[i].flag1;
|
||||
volatile uint32_t *f2 = &lines[i].flag2;
|
||||
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
|
||||
}
|
||||
if (ready) {
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (args->tail < LOAD(recvTail)) {
|
||||
// Send through network
|
||||
if (LOAD(sizesFifo+buffSlot) != -1) {
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->channel->active_req == 0) {
|
||||
gettimeofday(&args->channel->tvs, NULL);
|
||||
args->channel->sizes = 0;
|
||||
}
|
||||
args->channel->active_req ++;
|
||||
args->channel->sizes += LOAD(sizesFifo+buffSlot);
|
||||
args->channel->send_byte += LOAD(sizesFifo+buffSlot);
|
||||
#endif
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
|
||||
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *f1 = &lines[i].flag1;
|
||||
volatile uint32_t *f2 = &lines[i].flag2;
|
||||
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
if (args->head < args->tail) {
|
||||
int done;
|
||||
int buffSlot = args->head%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
if (ready) {
|
||||
// Data is ready, try to send.
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
args->channel->active_req --;
|
||||
if (args->channel->active_req == 0) {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
|
||||
if (delta) {
|
||||
args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
|
||||
args->channel->bw_count ++;
|
||||
}
|
||||
gettimeofday(&args->channel->tvs, NULL);
|
||||
args->channel->sizes = 0;
|
||||
}
|
||||
}
|
||||
args->channel->active_req ++;
|
||||
args->channel->sizes += LOAD(sizesFifo+buffSlot);
|
||||
args->channel->send_byte += LOAD(sizesFifo+buffSlot);
|
||||
#endif
|
||||
args->head += args->sliceSteps;
|
||||
STORE(&resources->sendMem->head, args->head);
|
||||
args->idle = 0;
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
args->transmitted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (args->head == args->end) {
|
||||
resources->step = args->end;
|
||||
args->idle = 0;
|
||||
args->state = ncclProxyOpNone;
|
||||
// Check whether the network has completed some send operations.
|
||||
if (args->done < args->transmitted) {
|
||||
int done;
|
||||
int buffSlot = args->done%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]);
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
args->channel->active_req --;
|
||||
if (args->channel->active_req == 0) {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
|
||||
if (delta) {
|
||||
args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
|
||||
args->channel->bw_count ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (resources->shared) {
|
||||
char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
|
||||
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, ptr));
|
||||
}
|
||||
args->done += args->sliceSteps;
|
||||
|
||||
if (resources->shared == 0) {
|
||||
resources->sendMem->head = args->done;
|
||||
}
|
||||
args->idle = 0;
|
||||
if (args->done == args->end) {
|
||||
resources->step = args->end;
|
||||
args->state = ncclProxyOpNone;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -366,45 +390,57 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Round to next multiple of sliceSteps
|
||||
resources->step = ROUNDUP(resources->step, args->chunkSteps);
|
||||
args->head = resources->step;
|
||||
args->tail = resources->step;
|
||||
args->end = args->head + args->nsteps;
|
||||
args->posted = args->received = args->transmitted = args->done = resources->step;
|
||||
args->end = resources->step + args->nsteps;
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
args->idle = 1;
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
args->idle = 1;
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* mhandle = *(resources->mhandlesProto[p]);
|
||||
if (args->head < args->end) {
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
int sliceSize = stepSize * args->sliceSteps;
|
||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (args->channel->active_req == 0) {
|
||||
gettimeofday(&args->channel->tvs, NULL);
|
||||
args->channel->sizes = 0;
|
||||
}
|
||||
args->channel->active_req ++;
|
||||
}
|
||||
#endif
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
int buffSize = stepSize*args->sliceSteps;
|
||||
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
|
||||
if (args->recvbytes < buffSize) buffSize = args->recvbytes;
|
||||
if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
|
||||
int buffSlot = args->posted%NCCL_STEPS;
|
||||
char* ptr;
|
||||
if (resources->shared) {
|
||||
NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, &ptr));
|
||||
if (ptr == NULL) return ncclInternalError;
|
||||
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
|
||||
STORE(ptrsFifo+buffSlot, ptr);
|
||||
} else {
|
||||
ptr = localBuff+buffSlot*stepSize;
|
||||
}
|
||||
if (args->tail > args->head) {
|
||||
int buffSlot = args->head%NCCL_STEPS;
|
||||
int done, size;
|
||||
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
|
||||
if (done) {
|
||||
args->head += args->sliceSteps;
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (args->channel->active_req == 0) {
|
||||
gettimeofday(&args->channel->tvs, NULL);
|
||||
args->channel->sizes = 0;
|
||||
}
|
||||
args->channel->active_req ++;
|
||||
}
|
||||
#endif
|
||||
args->posted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
} else if (resources->shared) {
|
||||
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
|
||||
}
|
||||
}
|
||||
if (args->posted > args->received) {
|
||||
int buffSlot = args->received%NCCL_STEPS;
|
||||
int done, size;
|
||||
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
|
||||
if (done) {
|
||||
args->received += args->sliceSteps;
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
args->channel->active_req --;
|
||||
args->channel->sizes += size;
|
||||
args->channel->recv_byte += size;
|
||||
@@ -417,18 +453,50 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
args->channel->bw_count ++;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
|
||||
STORE(&resources->recvMem->tail, args->head);
|
||||
}
|
||||
args->idle = 0;
|
||||
}
|
||||
#endif
|
||||
if (size > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
|
||||
// Don't pass data to the GPU yet, flush first.
|
||||
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
|
||||
char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
|
||||
NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, args->requests+buffSlot));
|
||||
} else {
|
||||
args->requests[buffSlot] = NULL;
|
||||
}
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
if (args->head == args->end) {
|
||||
resources->step = args->end;
|
||||
args->idle = 0;
|
||||
args->state = ncclProxyOpNone;
|
||||
if (args->received > args->transmitted) {
|
||||
// Progress flush operations
|
||||
int buffSlot = args->transmitted%NCCL_STEPS;
|
||||
int done = 1;
|
||||
if (args->requests[buffSlot]) NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
args->transmitted += args->sliceSteps;
|
||||
__sync_synchronize();
|
||||
resources->recvMem->tail = args->transmitted;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
if (args->transmitted > args->done) {
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
uint64_t done = LOAD(sendHead);
|
||||
while (done > args->done &&
|
||||
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
|
||||
args->transmitted > args->done) {
|
||||
if (resources->shared) {
|
||||
char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
|
||||
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
|
||||
}
|
||||
args->done += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
if (args->done == args->end) {
|
||||
resources->step = args->end;
|
||||
args->state = ncclProxyOpNone;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -25,9 +25,8 @@
|
||||
#include "ibvwrap.h"
|
||||
|
||||
#define USE_RDMA_WRITE 1
|
||||
#define USE_RDMA_SEND_INLINE 0
|
||||
#define MAXNAMESIZE 64
|
||||
static char ncclIbIfName[MAX_IF_NAME_SIZE];
|
||||
static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
|
||||
static union socketAddress ncclIbIfAddr;
|
||||
|
||||
static int ncclNIbDevs = -1;
|
||||
@@ -58,6 +57,8 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
|
||||
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
|
||||
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
|
||||
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
|
||||
NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
|
||||
NCCL_PARAM(IbSl, "IB_SL", 0);
|
||||
NCCL_PARAM(IbTc, "IB_TC", 0);
|
||||
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
|
||||
@@ -200,7 +201,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
|
||||
}
|
||||
line[1023] = '\0';
|
||||
char addrline[1024];
|
||||
char addrline[SOCKET_NAME_MAXLEN+1];
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
|
||||
}
|
||||
pthread_mutex_unlock(&ncclIbLock);
|
||||
@@ -251,7 +252,7 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#define MAX_REQUESTS 128
|
||||
#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
|
||||
|
||||
struct ncclIbQpInfo {
|
||||
uint32_t lid;
|
||||
@@ -272,18 +273,19 @@ struct ncclIbHandle {
|
||||
union socketAddress connectAddr;
|
||||
};
|
||||
|
||||
struct ncclIbVerbs {
|
||||
struct ibv_pd* pd;
|
||||
struct ibv_cq* cq;
|
||||
};
|
||||
|
||||
struct ncclIbRequest {
|
||||
int used;
|
||||
int type;
|
||||
struct ncclIbVerbs* verbs;
|
||||
int done;
|
||||
int events;
|
||||
int size;
|
||||
int free;
|
||||
};
|
||||
|
||||
// Verbs state embedded at the start of both the send and the receive comm.
// ncclIbRegMr casts a comm pointer directly to this type, and a static_assert
// there checks that `verbs` sits at the same offset in both comm structs.
struct ncclIbVerbs {
|
||||
struct ibv_pd* pd; // filled in by ncclIbInitVerbs via wrap_ibv_alloc_pd
|
||||
struct ibv_cq* cq; // filled in by ncclIbInitVerbs via wrap_ibv_create_cq; also used as send_cq and recv_cq in ncclIbCreateQp
|
||||
uint64_t pad[2]; // NOTE(review): presumably padding so the member that follows `verbs` in the comm structs stays 32-byte aligned (see the static_asserts on the fifo offsets) - confirm
|
||||
struct ncclIbRequest reqs[MAX_REQUESTS]; // request slots; ncclIbGetRequest hands out the first one whose `used` is 0
|
||||
};
|
||||
|
||||
struct ncclIbListenComm {
|
||||
@@ -297,18 +299,23 @@ struct alignas(64) ncclIbSendFifo {
|
||||
uint32_t seq;
|
||||
uint32_t rkey;
|
||||
uint32_t ready;
|
||||
uint64_t pad[1]; // Pad FIFO element size to be 32-bytes
|
||||
};
|
||||
|
||||
struct ncclIbSendComm {
|
||||
struct ncclIbVerbs verbs;
|
||||
struct ncclIbSendFifo fifo[MAX_REQUESTS];
|
||||
struct ncclIbRequest reqs[MAX_REQUESTS];
|
||||
uint32_t fifoHead;
|
||||
int fd;
|
||||
int ready;
|
||||
struct ibv_qp* qp;
|
||||
struct ibv_mr* fifoMr;
|
||||
};
|
||||
// The SendFifo needs to be 32-byte aligned and each element needs
|
||||
// to be a 32-byte multiple, so that an entry does not get split and
|
||||
// written out of order when IB Relaxed Ordering is enabled
|
||||
static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
|
||||
static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
|
||||
|
||||
struct ncclIbGpuFlush {
|
||||
int enabled;
|
||||
@@ -331,16 +338,17 @@ struct ncclIbRemFifo {
|
||||
struct ncclIbRecvComm {
|
||||
struct ncclIbVerbs verbs;
|
||||
struct ncclIbRemFifo remFifo;
|
||||
struct ncclIbRequest reqs[MAX_REQUESTS];
|
||||
int fd;
|
||||
int ready;
|
||||
struct ibv_qp* qp;
|
||||
struct ncclIbGpuFlush gpuFlush;
|
||||
};
|
||||
// Receive-side counterpart of the ncclIbSendComm fifo alignment check:
// remFifo must start on a 32-byte boundary inside ncclIbRecvComm. The
// diagnostic names the struct and member actually being checked.
static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbRecvComm remFifo must be 32-byte aligned");
|
||||
|
||||
ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
|
||||
NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
|
||||
NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, MAX_REQUESTS, NULL, NULL, 0));
|
||||
// Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
|
||||
NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS, NULL, NULL, 0));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -356,17 +364,17 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
|
||||
qpInitAttr.send_cq = verbs->cq;
|
||||
qpInitAttr.recv_cq = verbs->cq;
|
||||
qpInitAttr.qp_type = IBV_QPT_RC;
|
||||
// We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM)
|
||||
// We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
|
||||
qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
|
||||
qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
|
||||
qpInitAttr.cap.max_send_sge = 1;
|
||||
qpInitAttr.cap.max_recv_sge = 1;
|
||||
qpInitAttr.cap.max_inline_data = 0;
|
||||
qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
|
||||
NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr));
|
||||
struct ibv_qp_attr qpAttr;
|
||||
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
|
||||
qpAttr.qp_state = IBV_QPS_INIT;
|
||||
qpAttr.pkey_index = 0;
|
||||
qpAttr.pkey_index = ncclParamIbPkey();
|
||||
qpAttr.port_num = ib_port;
|
||||
qpAttr.qp_access_flags = access_flags;
|
||||
NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
|
||||
@@ -481,7 +489,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
||||
socklen_t socklen = sizeof(struct sockaddr_in);
|
||||
SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
|
||||
struct ncclIbQpInfo remQpInfo;
|
||||
NCCLCHECK(socketReceive(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
|
||||
NCCLCHECK(socketRecv(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
|
||||
|
||||
// IB setup
|
||||
ibv_context* ctx = ncclIbDevs[lComm->dev].context;
|
||||
@@ -509,14 +517,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
||||
NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
|
||||
rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
|
||||
rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
|
||||
|
||||
#if USE_RDMA_SEND_INLINE
|
||||
// Determine whether the remFifo element data can be sent INLINE
|
||||
struct ibv_qp_attr attr;
|
||||
struct ibv_qp_init_attr init_attr;
|
||||
NCCLCHECK(wrap_ibv_query_qp(qp, &attr, IBV_QP_CAP, &init_attr));
|
||||
if (init_attr.cap.max_inline_data >= rComm->remFifo.sge.length) rComm->remFifo.flags = IBV_SEND_INLINE;
|
||||
#endif
|
||||
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
|
||||
|
||||
// Allocate Flush dummy buffer for GPU Direct RDMA
|
||||
rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
|
||||
@@ -553,16 +554,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** req) {
|
||||
ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
|
||||
for (int i=0; i<MAX_REQUESTS; i++) {
|
||||
struct ncclIbRequest* r = reqs+i;
|
||||
struct ncclIbRequest* r = verbs->reqs+i;
|
||||
if (r->used == 0) {
|
||||
r->used = 1;
|
||||
r->type = 0;
|
||||
r->verbs = NULL;
|
||||
r->done = 0;
|
||||
r->verbs = verbs;
|
||||
r->events = 1;
|
||||
r->size = -1;
|
||||
r->free = 0;
|
||||
*req = r;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -571,6 +571,10 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
|
||||
*req = NULL;
|
||||
return ncclInternalError;
|
||||
}
|
||||
// Releases a request slot by clearing its `used` flag, which is the condition
// ncclIbGetRequest tests when it looks for a free slot.
// r: the request to release; it is dereferenced without a NULL check.
// Always returns ncclSuccess.
ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
|
||||
r->used = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
|
||||
struct ncclIbQpInfo remQpInfo;
|
||||
@@ -585,7 +589,6 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
|
||||
NCCLCHECK(ncclIbRtrQp(qp, &remQpInfo));
|
||||
NCCLCHECK(ncclIbRtsQp(qp));
|
||||
comm->ready = 1;
|
||||
|
||||
// Block until this is done. It *should* not block indefinitely.
|
||||
NCCLCHECK(socketSend(comm->fd, &comm->ready, sizeof(int)));
|
||||
|
||||
@@ -606,6 +609,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
|
||||
#define REG_ALIGN (4096)
|
||||
|
||||
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
||||
static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
|
||||
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
|
||||
uint64_t addr = (uint64_t)data;
|
||||
assert(size > 0);
|
||||
@@ -639,8 +643,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
|
||||
if (LOAD(readyPtr) == 0) { *request = NULL; return ncclSuccess; }
|
||||
|
||||
struct ncclIbRequest* req;
|
||||
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
|
||||
req->verbs = &comm->verbs;
|
||||
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
||||
req->size = size;
|
||||
|
||||
struct ibv_send_wr wr;
|
||||
@@ -656,14 +659,10 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
|
||||
wr.sg_list = &sge;
|
||||
wr.num_sge = 1;
|
||||
}
|
||||
#if USE_RDMA_WRITE == 0
|
||||
wr.opcode = IBV_WR_SEND;
|
||||
wr.send_flags = IBV_SEND_SIGNALED;
|
||||
|
||||
int useAr = 0;
|
||||
if (size > ncclParamIbArThreshold()) {
|
||||
useAr = 1;
|
||||
}
|
||||
#if USE_RDMA_WRITE
|
||||
#else
|
||||
__sync_synchronize(); // order the readyPtr load against rkey load below
|
||||
// Sanity checks to catch user collective call count/size mismatches
|
||||
// plus any potential programming errors
|
||||
@@ -672,7 +671,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
|
||||
size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
|
||||
return ncclInternalError;
|
||||
}
|
||||
int useAr = 0;
|
||||
if (size > ncclParamIbArThreshold()) {
|
||||
useAr = 1;
|
||||
}
|
||||
wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
|
||||
wr.send_flags = useAr ? 0 : IBV_SEND_SIGNALED;
|
||||
wr.wr.rdma.remote_addr = LOAD(&slot->addr);
|
||||
wr.wr.rdma.rkey = LOAD(&slot->rkey);
|
||||
wr.imm_data = size; // Send the message size via imm_data
|
||||
@@ -696,7 +700,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
|
||||
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
|
||||
wr.sg_list = NULL;
|
||||
wr.num_sge = 0;
|
||||
wr.send_flags &= ~IBV_SEND_SIGNALED;
|
||||
wr.send_flags |= IBV_SEND_SIGNALED;
|
||||
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
|
||||
}
|
||||
#endif
|
||||
@@ -704,28 +708,51 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size) {
|
||||
ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) {
|
||||
struct ibv_send_wr wr;
|
||||
memset(&wr, 0, sizeof(wr));
|
||||
struct ncclIbRequest* req;
|
||||
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
|
||||
req->verbs = &comm->verbs;
|
||||
req->free = 1; // Not a user req ; free as soon as it is complete.
|
||||
wr.wr_id = (uint64_t)req;
|
||||
|
||||
struct ncclIbSendFifo* localElem = comm->remFifo.elems + (comm->remFifo.tail % MAX_REQUESTS);
|
||||
int slot = comm->remFifo.tail%MAX_REQUESTS;
|
||||
struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot;
|
||||
localElem->addr = addr;
|
||||
localElem->rkey = rkey;
|
||||
localElem->ready = 1;
|
||||
localElem->size = size; // Sanity/Debugging
|
||||
localElem->seq = comm->remFifo.tail; // Sanity/Debugging
|
||||
wr.wr.rdma.remote_addr = comm->remFifo.addr + (comm->remFifo.tail % MAX_REQUESTS) * sizeof(struct ncclIbSendFifo);
|
||||
wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo);
|
||||
wr.wr.rdma.rkey = comm->remFifo.rkey;
|
||||
comm->remFifo.sge.addr = (uint64_t)localElem;
|
||||
wr.sg_list = &comm->remFifo.sge;
|
||||
wr.num_sge = 1;
|
||||
wr.opcode = IBV_WR_RDMA_WRITE;
|
||||
wr.send_flags = IBV_SEND_SIGNALED | comm->remFifo.flags; // IBV_SEND_INLINE
|
||||
wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE
|
||||
|
||||
// We need to occasionally post a request with the IBV_SEND_SIGNALED flag, otherwise
|
||||
// the send queue will never empty.
|
||||
//
|
||||
// From https://www.rdmamojo.com/2014/06/30/working-unsignaled-completions/
|
||||
// "How to use Unsignaled Completion?" / "Gotchas and Pitfalls"
|
||||
// All posted Send Requested, Signaled and Unsignaled, are considered outstanding until
|
||||
// a Work Completion that they, or Send Requests that were posted after them, was polled
|
||||
// from the Completion Queue associated with the Send Queue. This means if one works with
|
||||
// a Queue Pair that was configured to work with Unsignaled Completions, he must make
|
||||
// sure that occasionally (before the Send Queue is full with outstanding Send Requests)
|
||||
// a Send Request that generate Work Completion will be posted.
|
||||
//
|
||||
// Not following this rule may lead to a case that the Send Queue is full with Send
|
||||
// Requests that won't generate Work Completion:
|
||||
//
|
||||
// - The Send Queue is full, so no new Send Requests can be posted to it
|
||||
// - The Send Queue can't be emptied, since no Work Completion can be generated anymore
|
||||
// (the reason is that no Work Completion, that can generate Work Completion that
|
||||
// polling it will empty the Send Queue, can be posted)
|
||||
// - The status of all posted Send Request is considered unknown
|
||||
//
|
||||
if (slot == 0) {
|
||||
wr.send_flags |= IBV_SEND_SIGNALED;
|
||||
wr.wr_id = (uint64_t)req;
|
||||
req->events++;
|
||||
}
|
||||
|
||||
struct ibv_send_wr* bad_wr;
|
||||
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
|
||||
@@ -742,8 +769,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
|
||||
struct ibv_mr* mr = (struct ibv_mr*)mhandle;
|
||||
|
||||
struct ncclIbRequest* req;
|
||||
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
|
||||
req->verbs = &comm->verbs;
|
||||
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
||||
req->size = size;
|
||||
|
||||
struct ibv_recv_wr wr;
|
||||
@@ -765,17 +791,16 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
|
||||
*request = req;
|
||||
|
||||
// Post to FIFO to notify sender
|
||||
NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
|
||||
NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
|
||||
ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
|
||||
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
|
||||
if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
|
||||
|
||||
struct ncclIbRequest* req;
|
||||
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
|
||||
req->verbs = &comm->verbs;
|
||||
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
|
||||
struct ibv_mr* mr = (struct ibv_mr*)mhandle;
|
||||
|
||||
struct ibv_send_wr wr;
|
||||
@@ -792,11 +817,7 @@ ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
|
||||
struct ibv_send_wr* bad_wr;
|
||||
NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
|
||||
|
||||
int done = 0;
|
||||
while (done == 0) {
|
||||
NCCLCHECK((ncclResult_t)ncclIbTest(req, &done, NULL));
|
||||
}
|
||||
|
||||
*request = req;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -805,10 +826,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
|
||||
*done = 0;
|
||||
|
||||
while (1) {
|
||||
if (r->done == 1) {
|
||||
if (r->events == 0) {
|
||||
*done = 1;
|
||||
if (size) *size = r->size;
|
||||
r->used = 0;
|
||||
NCCLCHECK(ncclIbFreeRequest(r));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -833,11 +854,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
|
||||
doneReq->size = wc->imm_data;
|
||||
#endif
|
||||
}
|
||||
doneReq->done = 1;
|
||||
if (doneReq->free == 1) {
|
||||
// This is an internal (FIFO post) req. Free it immediately.
|
||||
doneReq->used = 0;
|
||||
}
|
||||
doneReq->events--;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -892,7 +909,7 @@ ncclNet_t ncclNetIb = {
|
||||
ncclIbDeregMr,
|
||||
ncclIbIsend,
|
||||
ncclIbIrecv,
|
||||
ncclIbFlush,
|
||||
ncclIbIflush,
|
||||
ncclIbTest,
|
||||
ncclIbCloseSend,
|
||||
ncclIbCloseRecv,
|
||||
|
||||
@@ -49,17 +49,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
|
||||
WARN("NET/Socket : no interface found");
|
||||
return ncclInternalError;
|
||||
} else {
|
||||
char line[1024];
|
||||
char addrline[1024];
|
||||
#define MAX_LINE_LEN (2047)
|
||||
char line[MAX_LINE_LEN+1];
|
||||
char addrline[SOCKET_NAME_MAXLEN+1];
|
||||
line[0] = '\0';
|
||||
addrline[SOCKET_NAME_MAXLEN] = '\0';
|
||||
for (int i=0; i<ncclNetIfs; i++) {
|
||||
strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
|
||||
memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
|
||||
NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
|
||||
snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
|
||||
socketToString(&addrs[i].sa, addrline));
|
||||
}
|
||||
line[1023] = '\0';
|
||||
line[MAX_LINE_LEN] = '\0';
|
||||
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
|
||||
}
|
||||
}
|
||||
@@ -113,8 +115,7 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
|
||||
|
||||
#define MAX_SOCKETS 64
|
||||
#define MAX_THREADS 16
|
||||
#define MAX_REQUESTS 128
|
||||
#define MAX_QUEUE_LEN MAX_REQUESTS
|
||||
#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
|
||||
#define MIN_CHUNKSIZE (64*1024)
|
||||
|
||||
NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
|
||||
@@ -150,6 +151,7 @@ struct ncclSocketRequest {
|
||||
|
||||
// Per-thread task queue used by the socket transport's helper threads.
// ncclSocketGetTask allocates it on first use (when `tasks` is still NULL).
struct ncclSocketTaskQueue {
|
||||
int next; // index advanced modulo `len` by ncclSocketGetTask; persistentSocketThread reads it as its "newest task seen" mark
|
||||
int len; // number of entries in `tasks`; set to MAX_REQUESTS * DIVUP(nSocks, nThreads) by ncclSocketGetTask
|
||||
struct ncclSocketTask* tasks; // array of `len` entries obtained from ncclCalloc
|
||||
};
|
||||
|
||||
@@ -189,7 +191,7 @@ void* persistentSocketThread(void *args_) {
|
||||
while (1) {
|
||||
int idle = 1;
|
||||
int mark = myQueue->next; // mark newest task seen
|
||||
for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) {
|
||||
for (int i=0; i<myQueue->len; i+=nSocksPerThread) {
|
||||
int repeat;
|
||||
do {
|
||||
repeat = 0;
|
||||
@@ -364,7 +366,11 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
|
||||
struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
|
||||
// create helper threads and prepare per-thread task queue
|
||||
if (queue->tasks == NULL) {
|
||||
NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN));
|
||||
// each request can be divided up to nSocks tasks, and
|
||||
// these tasks are distributed to nThreads threads,
|
||||
// we need to make sure each thread queue has enough slots for MAX_REQUESTS
|
||||
queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads);
|
||||
NCCLCHECK(ncclCalloc(&queue->tasks, queue->len));
|
||||
queue->next = 0;
|
||||
res->comm = comm;
|
||||
pthread_mutex_init(&res->threadLock, NULL);
|
||||
@@ -383,7 +389,7 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
|
||||
r->used = 1;
|
||||
*req = r;
|
||||
pthread_mutex_lock(&res->threadLock);
|
||||
queue->next = (queue->next+1)%MAX_QUEUE_LEN;
|
||||
queue->next = (queue->next+1)%queue->len;
|
||||
res->state = start;
|
||||
pthread_cond_signal(&res->threadCond);
|
||||
pthread_mutex_unlock(&res->threadLock);
|
||||
@@ -421,6 +427,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
|
||||
// divide into subtasks
|
||||
int chunkOffset = 0, i = 0;
|
||||
if (r->comm->nSocks > 0) {
|
||||
// each request can be divided up to nSocks tasks
|
||||
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
|
||||
while (chunkOffset < r->size) {
|
||||
int chunkSize = std::min(taskSize, r->size-chunkOffset);
|
||||
@@ -478,7 +485,7 @@ ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
|
||||
ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
|
||||
// We don't support CUDA pointers, so we don't need a flush operation
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -527,7 +534,7 @@ ncclNet_t ncclNetSocket = {
|
||||
ncclSocketDeregMr,
|
||||
ncclSocketIsend,
|
||||
ncclSocketIrecv,
|
||||
ncclSocketFlush,
|
||||
ncclSocketIflush,
|
||||
ncclSocketTest,
|
||||
ncclSocketClose,
|
||||
ncclSocketClose,
|
||||
|
||||
@@ -12,26 +12,30 @@
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#endif
|
||||
#include "shm.h"
|
||||
#include "bootstrap.h"
|
||||
|
||||
struct p2pConnectInfo {
|
||||
int direct;
|
||||
int rank;
|
||||
int read;
|
||||
union {
|
||||
void* directPtr;
|
||||
hipIpcMemHandle_t devIpc;
|
||||
};
|
||||
void* directPtr;
|
||||
hipIpcMemHandle_t devIpc;
|
||||
};
|
||||
|
||||
struct p2pSendResources {
|
||||
struct ncclSendMem* devMem;
|
||||
void* ipcPtr;
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
int remoteId;
|
||||
int memRank;
|
||||
void* bootstrap;
|
||||
};
|
||||
|
||||
struct p2pRecvResources {
|
||||
struct ncclRecvMem* devMem;
|
||||
void* ipcPtr;
|
||||
int remoteId;
|
||||
int memRank;
|
||||
void* bootstrap;
|
||||
};
|
||||
|
||||
#include <sys/types.h>
|
||||
@@ -69,9 +73,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
}
|
||||
|
||||
// Check topology / p2p level.
|
||||
int read;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, &read));
|
||||
int intermediateRank;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
|
||||
if (*ret == 0) return ncclSuccess;
|
||||
if (intermediateRank != -1) return ncclSuccess;
|
||||
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
@@ -114,31 +119,52 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
// Setting this to non zero causes P2P to use Reads rather than Writes
|
||||
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
|
||||
|
||||
static int p2pUseRead(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
int readEnable = ncclParamP2pReadEnable();
|
||||
if (readEnable != -2) return readEnable;
|
||||
|
||||
int p2p, read;
|
||||
static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
|
||||
int p2p;
|
||||
// Queries the topology to see if the GPUs are Ampere and
|
||||
// connected via NVLink, if so we enable P2P Read by default
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, &read));
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));
|
||||
|
||||
return read;
|
||||
int readEnable = ncclParamP2pReadEnable();
|
||||
if (readEnable != -2) *read = readEnable;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (peerInfo->cudaDev != myInfo->cudaDev) {
|
||||
// Enable P2P access
|
||||
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
||||
if (err == hipErrorPeerAccessAlreadyEnabled) {
|
||||
hipGetLastError();
|
||||
} else if (err != hipSuccess) {
|
||||
WARN("failed to peer with device %d(=%lx): %d %s",
|
||||
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
}
|
||||
*devMem = p2pInfo->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
} else {
|
||||
CUDACHECK(hipIpcOpenMemHandle(devMem, p2pInfo->devIpc, hipIpcMemLazyEnablePeerAccess));
|
||||
*ipcPtr = *devMem;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Send: Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
|
||||
struct p2pSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
int useRead = p2pUseRead(topo, myInfo, peerInfo);
|
||||
int useRead, intermediateRank;
|
||||
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
|
||||
if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
|
||||
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));
|
||||
|
||||
resources->next_hdp_reg = 0;
|
||||
uint32_t linktype, hops;
|
||||
@@ -154,116 +180,84 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
struct p2pConnectInfo info;
|
||||
info.read = useRead;
|
||||
const char* useReadStr = info.read ? "/read" : "";
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
info.direct = 1;
|
||||
info.directPtr = resources->devMem;
|
||||
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%d] -> %d[%d] via P2P/common device%s",
|
||||
channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr);
|
||||
return ncclInternalError;
|
||||
|
||||
resources->remoteId = -1;
|
||||
resources->bootstrap = comm->bootstrap;
|
||||
if (intermediateRank == -1) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize, true));
|
||||
info.rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (useRead == 0) send->conn.direct |= NCCL_DIRECT_GPU;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
} else {
|
||||
// Enable P2P access
|
||||
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
||||
if (err == hipErrorPeerAccessAlreadyEnabled) {
|
||||
hipGetLastError();
|
||||
} else if (err != hipSuccess) {
|
||||
WARN("failed to peer with device %d(=%lx): %d %s",
|
||||
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
|
||||
CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
}
|
||||
} else {
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
|
||||
info.direct = 0;
|
||||
// Map IPC and enable P2P access
|
||||
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
|
||||
if (err != hipSuccess) {
|
||||
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
|
||||
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
//TRACE_DUMP_IPC(&info.devIpc);
|
||||
NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
|
||||
info.rank = intermediateRank;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
|
||||
comm->peerInfo[intermediateRank].busId, useReadStr);
|
||||
}
|
||||
resources->memRank = info.rank;
|
||||
|
||||
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
|
||||
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||
memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
|
||||
|
||||
struct p2pRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
int useRead = p2pUseRead(topo, myInfo, peerInfo);
|
||||
int useRead, intermediateRank;
|
||||
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff);
|
||||
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
|
||||
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));
|
||||
|
||||
struct p2pConnectInfo info;
|
||||
info.read = useRead;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
info.direct = 1;
|
||||
info.directPtr = resources->devMem;
|
||||
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
|
||||
|
||||
resources->remoteId = -1;
|
||||
resources->bootstrap = comm->bootstrap;
|
||||
if (intermediateRank == -1) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize, true));
|
||||
info.rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (useRead == 0) recv->conn.direct |= NCCL_DIRECT_GPU;
|
||||
} else {
|
||||
// Enable P2P access
|
||||
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
|
||||
if (err == hipErrorPeerAccessAlreadyEnabled) {
|
||||
hipGetLastError();
|
||||
} else if (err != hipSuccess) {
|
||||
WARN("failed to peer with device %d(=%lx): %d %s",
|
||||
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
|
||||
}
|
||||
} else {
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
|
||||
info.direct = 0;
|
||||
// Map IPC and enable P2P access
|
||||
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
|
||||
if (err != hipSuccess) {
|
||||
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
|
||||
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
//TRACE_DUMP_IPC(&info.devIpc);
|
||||
NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
|
||||
info.rank = intermediateRank;
|
||||
}
|
||||
resources->memRank = info.rank;
|
||||
|
||||
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
|
||||
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||
memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Connect/Send to this peer */
|
||||
static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
|
||||
struct ncclRecvMem* remDevMem;
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
if (info->direct) {
|
||||
remDevMem = (struct ncclRecvMem*)(info->directPtr);
|
||||
if (info->read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
|
||||
} else {
|
||||
//TRACE_DUMP_IPC(&info->devIpc);
|
||||
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
|
||||
remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
|
||||
if (err != hipSuccess) {
|
||||
WARN("failed to open CUDA IPC handle : %d %s",
|
||||
err, hipGetErrorString(err));
|
||||
return ncclUnhandledCudaError;
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));
|
||||
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
@@ -283,26 +277,12 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
|
||||
}
|
||||
|
||||
/* Connect/Recv from this peer */
|
||||
ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
|
||||
struct ncclSendMem* remDevMem;
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
if (info->direct) {
|
||||
remDevMem = (struct ncclSendMem*)(info->directPtr);
|
||||
if (info->read == 0) {
|
||||
recv->conn.direct |= NCCL_DIRECT_GPU;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
}
|
||||
} else {
|
||||
//TRACE_DUMP_IPC(&info->devIpc);
|
||||
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
|
||||
remDevMem = (struct ncclSendMem*)resources->ipcPtr;
|
||||
if (err != hipSuccess) {
|
||||
WARN("failed to open CUDA IPC handle : %d %s",
|
||||
err, hipGetErrorString(err));
|
||||
return ncclUnhandledCudaError;
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));
|
||||
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
@@ -316,6 +296,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
}
|
||||
recv->conn.tail = &resources->devMem->tail;
|
||||
recv->conn.head = &remDevMem->head;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -323,6 +304,10 @@ ncclResult_t p2pSendFree(void* resources) {
|
||||
struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
|
||||
if (sendRes->ipcPtr)
|
||||
CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
|
||||
if (sendRes->remoteId != -1) {
|
||||
NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap));
|
||||
sendRes->devMem = NULL;
|
||||
}
|
||||
CUDACHECK(hipFree(sendRes->devMem));
|
||||
free(sendRes);
|
||||
return ncclSuccess;
|
||||
@@ -332,6 +317,10 @@ ncclResult_t p2pRecvFree(void* resources) {
|
||||
struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
|
||||
if (recvRes->ipcPtr)
|
||||
CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
|
||||
if (recvRes->remoteId != -1) {
|
||||
NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap));
|
||||
recvRes->devMem = NULL;
|
||||
}
|
||||
CUDACHECK(hipFree(recvRes->devMem));
|
||||
free(recvRes);
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -57,7 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
#define MAX_SHM_NAME_LEN 1024
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
|
||||
struct shmSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
@@ -81,7 +81,7 @@ ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
struct shmRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
@@ -106,7 +106,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
}
|
||||
|
||||
/* Connect to this peer */
|
||||
ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
// Setup device pointers
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
|
||||
@@ -131,7 +131,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
// Setup device pointers
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
|
||||
@@ -65,13 +65,25 @@ BEGIN {
|
||||
do {
|
||||
match($col_1, /\[([0-9]+)\]/, ary)
|
||||
chan=strtonum(ary[1])
|
||||
match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\|(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)/, ary)
|
||||
if(ary[8]!="-1")
|
||||
treedns[ary[7] "," ary[8] "," chan]="1"
|
||||
if(ary[9]!="-1")
|
||||
treedns[ary[7] "," ary[9] "," chan]="1"
|
||||
if(ary[10]!="-1")
|
||||
treedns[ary[7] "," ary[10] "," chan]="1"
|
||||
where = match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\|(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)/, ary)
|
||||
if(where != 0) {
|
||||
if(ary[8]!="-1")
|
||||
treedns[ary[7] "," ary[8] "," chan]="1"
|
||||
if(ary[9]!="-1")
|
||||
treedns[ary[7] "," ary[9] "," chan]="1"
|
||||
if(ary[10]!="-1")
|
||||
treedns[ary[7] "," ary[10] "," chan]="1"
|
||||
} else {
|
||||
where = match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)/, ary)
|
||||
if(where != 0) {
|
||||
if(ary[1]!="-1")
|
||||
treedns[ary[4] "," ary[1] "," chan]="1"
|
||||
if(ary[2]!="-1")
|
||||
treedns[ary[4] "," ary[2] "," chan]="1"
|
||||
if(ary[3]!="-1")
|
||||
treedns[ary[4] "," ary[3] "," chan]="1"
|
||||
}
|
||||
}
|
||||
if(chan>max_treedn)
|
||||
max_treedn=chan
|
||||
col_1=col_1+2
|
||||
|
||||
@@ -31,6 +31,32 @@
|
||||
#include <list>
|
||||
#include <iterator>
|
||||
|
||||
struct ibtestProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
struct ncclChannel* channel;
|
||||
struct ncclConnector* connector;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int protocol;
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
|
||||
// Internal state
|
||||
uint64_t head;
|
||||
uint64_t tail;
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
int idle;
|
||||
|
||||
// Element linking
|
||||
pthread_mutex_t mutex;
|
||||
struct ibtestProxyArgs* next;
|
||||
struct ibtestProxyArgs* nextPeer;
|
||||
};
|
||||
|
||||
ncclResult_t initNet();
|
||||
|
||||
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
|
||||
@@ -204,7 +230,7 @@ private:
|
||||
bool runSend;
|
||||
bool use_gdr_read;
|
||||
int sliceSteps;
|
||||
struct ncclProxyArgs args;
|
||||
struct ibtestProxyArgs args;
|
||||
|
||||
ncclResult_t connect(char* ip, uint16_t port) {
|
||||
inet_pton(AF_INET, ip, &netConnectAddr.sin_addr);
|
||||
@@ -326,7 +352,7 @@ public:
|
||||
void launchKernel(uint64_t end) {
|
||||
*sendHead = 0; *sendTail = 0; *sourceCycle = 0; *sourceBytes = 0;
|
||||
send_sizes = 0; send_bw_cumulative = 0; send_bw_count =0; send_byte = 0;
|
||||
memset(&args, 0, sizeof(struct ncclProxyArgs));
|
||||
memset(&args, 0, sizeof(struct ibtestProxyArgs));
|
||||
args.head = 0;
|
||||
args.tail = 0;
|
||||
args.end = end;
|
||||
@@ -365,7 +391,7 @@ private:
|
||||
bool runRecv;
|
||||
bool use_gdr_write;
|
||||
int sliceSteps;
|
||||
struct ncclProxyArgs args;
|
||||
struct ibtestProxyArgs args;
|
||||
|
||||
ncclResult_t listen() {
|
||||
printf("GDR Write %s\n", use_gdr_write ? "enabled" : "disabled");
|
||||
@@ -472,7 +498,7 @@ public:
|
||||
}
|
||||
args.head += args.sliceSteps;
|
||||
recv_byte += size;
|
||||
NCCLCHECK(ncclNetFlush(netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
|
||||
NCCLCHECK(ncclNetIflush(netRecvComm, localBuff+buffSlot*stepSize, size, mhandle, args.requests+buffSlot));
|
||||
STORE(recvHead, args.head);
|
||||
args.idle = 0;
|
||||
}
|
||||
@@ -486,7 +512,7 @@ public:
|
||||
void launchKernel(uint64_t end) {
|
||||
*recvHead = 0; *recvTail = 0; *recvErrorCount = 0; *sinkCycle = 0, *sinkBytes = 0;
|
||||
recv_sizes = 0; recv_bw_cumulative = 0; recv_bw_count =0; recv_byte = 0;
|
||||
memset(&args, 0, sizeof(struct ncclProxyArgs));
|
||||
memset(&args, 0, sizeof(struct ibtestProxyArgs));
|
||||
args.head = 0;
|
||||
args.tail = 0;
|
||||
args.end = end;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user