Merge remote-tracking branch 'nccl/master' into no-target-id

[ROCm/rccl commit: d469947641]
This commit is contained in:
Wenkai Du
2020-12-01 11:33:47 -05:00
106 zmienionych plików z 11943 dodań i 4104 usunięć
-4
Wyświetl plik
@@ -89,10 +89,6 @@ set(CU_SOURCES
src/collectives/device/broadcast.cu
src/collectives/device/reduce_scatter.cu
src/collectives/device/sendrecv.cu
src/collectives/device/gather.cu
src/collectives/device/scatter.cu
src/collectives/device/all_to_all.cu
src/collectives/device/all_to_allv.cu
src/collectives/device/functions.cu)
set(CPP_SOURCES)
+5
Wyświetl plik
@@ -11,6 +11,7 @@ KEEP ?= 0
DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 0
NVTX ?= 1
NVCC = $(CUDA_HOME)/bin/nvcc
@@ -87,6 +88,10 @@ ifneq ($(TRACE), 0)
CXXFLAGS += -DENABLE_TRACE
endif
ifeq ($(NVTX), 0)
CXXFLAGS += -DNVTX_DISABLE
endif
ifneq ($(KEEP), 0)
NVCUFLAGS += -keep
endif
+2 -2
Wyświetl plik
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 7
NCCL_PATCH := 8
NCCL_MINOR := 8
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
+2 -2
Wyświetl plik
@@ -9,7 +9,7 @@ Package: libnccl${nccl:Major}
Section: libs
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: NVIDIA Collectives Communication Library (NCCL) Runtime
Description: NVIDIA Collective Communication Library (NCCL) Runtime
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
@@ -21,7 +21,7 @@ Package: libnccl-dev
Section: libdevel
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
Description: NVIDIA Collectives Communication Library (NCCL) Development Files
Description: NVIDIA Collective Communication Library (NCCL) Development Files
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
+3 -3
Wyświetl plik
@@ -1,7 +1,7 @@
Name: libnccl
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
License: BSD
@@ -18,13 +18,13 @@ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
%package devel
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
%description devel
NCCL development files
%package static
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
%description static
NCCL static library
+261 -199
Wyświetl plik
@@ -13,144 +13,77 @@
#include <unistd.h>
#include <sys/types.h>
struct bootstrapNetComm {
int fd;
};
/* Init functions */
static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
static int bootstrapNetIfs = -1;
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
static union socketAddress bootstrapNetIfAddr;
static int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
ncclResult_t bootstrapNetInit() {
if (bootstrapNetIfs == -1) {
if (bootstrapNetInitDone == 0) {
pthread_mutex_lock(&bootstrapNetLock);
if (bootstrapNetIfs == -1) {
bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
if (bootstrapNetIfs <= 0) {
WARN("Bootstrap : no socket interface found");
return ncclInternalError;
} else {
char line[1024];
char addrline[1024];
line[0] = '\0';
for (int i=0; i<bootstrapNetIfs; i++) {
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
if (bootstrapNetInitDone == 0) {
char* env = getenv("NCCL_COMM_ID");
if (env) {
union socketAddress remoteAddr;
if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
} else {
int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
if (nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
return ncclInternalError;
}
line[1023] = '\0';
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
}
char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
sprintf(line, " %s:", bootstrapNetIfName);
socketToString(&bootstrapNetIfAddr.sa, line+strlen(line));
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
bootstrapNetInitDone = 1;
}
pthread_mutex_unlock(&bootstrapNetLock);
}
return ncclSuccess;
}
static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
(*comm)->fd = -1;
return ncclSuccess;
}
static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
if (dev >= bootstrapNetIfs) return ncclInternalError;
memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
return ncclSuccess;
}
/* Socket Interface Selection type */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
// if dev >= 0, listen based on dev
if (dev >= 0) {
NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
} else if (dev == findSubnetIf) {
// handle stores a remote address
// need to find a local addr that is in the same network as the remote addr
union socketAddress localAddr;
char ifName[MAX_IF_NAME_SIZE];
if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
// pass the local address back
memcpy(connectAddr, &localAddr, sizeof(localAddr));
} // Otherwise, handle stores a local address
struct bootstrapNetComm* comm;
NCCLCHECK(bootstrapNetNewComm(&comm));
NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
*listenComm = comm;
return ncclSuccess;
}
static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
struct bootstrapNetComm* comm;
NCCLCHECK(bootstrapNetNewComm(&comm));
NCCLCHECK(connectAddress(&comm->fd, connectAddr));
*sendComm = comm;
return ncclSuccess;
}
static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
struct bootstrapNetComm* rComm;
NCCLCHECK(bootstrapNetNewComm(&rComm));
static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd) {
struct sockaddr_in sockaddr;
socklen_t socklen = sizeof(struct sockaddr_in);
SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
*recvComm = rComm;
SYSCHECKVAL(accept(listenFd, (struct sockaddr*)&sockaddr, &socklen), "accept", *recvFd);
return ncclSuccess;
}
static ncclResult_t bootstrapNetClose(void* opaqueComm) {
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
if (comm) {
close(comm->fd);
free(comm);
}
return ncclSuccess;
}
static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
// Additional sync functions
static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
NCCLCHECK(socketSend(comm->fd, data, size));
static ncclResult_t bootstrapNetSend(int fd, void* data, int size) {
NCCLCHECK(socketSend(fd, &size, sizeof(int)));
NCCLCHECK(socketSend(fd, data, size));
return ncclSuccess;
}
static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
int recvSize;
NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
if (recvSize > size) {
WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
return ncclInternalError;
}
NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
return ncclSuccess;
}
ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
return ncclSuccess;
}
struct extInfo {
int rank;
int nranks;
ncclNetHandle_t extHandleListenRoot;
ncclNetHandle_t extHandleListen;
union socketAddress extAddressListenRoot;
union socketAddress extAddressListen;
};
#include <sys/resource.h>
@@ -163,27 +96,29 @@ static ncclResult_t setFilesLimit() {
return ncclSuccess;
}
static void *bootstrapRoot(void* listenComm) {
static void *bootstrapRoot(void* args) {
int listenFd = (uint64_t)args;
ncclResult_t res = ncclSuccess;
int nranks = 0, c = 0;
struct extInfo info;
ncclNetHandle_t *rankHandles = NULL;
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
ncclNetHandle_t zero = { 0 }; // for sanity checking
void* tmpComm;
ncclResult_t res;
union socketAddress *rankAddresses = NULL;
union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
union socketAddress *zero = NULL;
NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
setFilesLimit();
TRACE(NCCL_INIT, "BEGIN");
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
int tmpFd;
NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &info, sizeof(info)), res, out);
close(tmpFd);
if (c == 0) {
nranks = info.nranks;
NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out);
NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out);
}
if (nranks != info.nranks) {
@@ -191,14 +126,14 @@ static void *bootstrapRoot(void* listenComm) {
goto out;
}
if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
goto out;
}
// Save the connection handle for that rank
memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
++c;
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
@@ -208,44 +143,46 @@ static void *bootstrapRoot(void* listenComm) {
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
void *tmpSendComm;
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
int tmpSendFd;
NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddresses+next, sizeof(union socketAddress)), res, out);
close(tmpSendFd);
}
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
out:
bootstrapNetCloseListen(listenComm);
if (rankHandles) free(rankHandles);
if (rankHandlesRoot) free(rankHandlesRoot);
close(listenFd);
if (rankAddresses) free(rankAddresses);
if (rankAddressesRoot) free(rankAddressesRoot);
if (zero) free(zero);
TRACE(NCCL_INIT, "DONE");
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
void* listenComm;
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
union socketAddress* connectAddr = (union socketAddress*) id;
int listenFd;
NCCLCHECK(createListenSocket(&listenFd, connectAddr));
pthread_t thread;
pthread_create(&thread, NULL, bootstrapRoot, listenComm);
pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd);
return ncclSuccess;
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
memset(id, 0, sizeof(ncclUniqueId));
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
union socketAddress* connectAddr = (union socketAddress*) id;
char* env = getenv("NCCL_COMM_ID");
if (env) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
} else {
memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
NCCLCHECK(bootstrapCreateRoot(id, false));
}
@@ -254,24 +191,135 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
struct unexConn {
int peer;
void* comm;
int fd;
struct unexConn* next;
};
struct extState {
void* extBstrapListenComm;
void* extBstrapRingRecvComm;
void* extBstrapRingSendComm;
ncclNetHandle_t* peerBstrapHandles;
struct unexConn* unexpectedConnections;
int rank;
int nranks;
int dev;
// Remote allocator state
// One instance is allocated by bootstrapInit and handed to the
// ncclRemoteMemAllocationService thread, which frees it when it exits normally.
struct remAllocState {
// Device index the service thread passes to hipSetDevice() on startup.
int cudaDev;
// Listening socket the service thread polls (POLLIN) for allocation requests.
int listenFd;
// Shutdown request read by the service loop:
//   0 - keep serving;
//   1 - keep running only while buffers are outstanding (set by bootstrapClose);
//   any other value - leave the loop immediately (bootstrapAbort sets 2).
int stop;
};
// Per-communicator bootstrap state, returned to callers as the opaque commState.
struct extState {
// Listening socket for this rank; the ring predecessor and bootstrapSend peers
// connect to it.
int extListenFd;
// Connection accepted from the previous rank; bootstrapAllGather receives on it.
int extRingRecvFd;
// Connection opened to the next rank; bootstrapAllGather sends on it.
int extRingSendFd;
// nranks entries: each rank's extListenFd address, exchanged with bootstrapAllGather.
union socketAddress* peerCommAddresses;
// nranks entries: each rank's allocation-service address, used by bootstrapRemAlloc.
union socketAddress* peerAllocAddresses;
// Connections accepted by bootstrapRecv from a peer other than the one requested,
// kept until a later bootstrapRecv asks for that peer.
struct unexConn* unexpectedConnections;
// NOTE(review): not assigned anywhere in the visible code - presumably the
// caller's device index; confirm against the rest of bootstrapInit.
int cudaDev;
int rank;
int nranks;
// Intermediate memory allocation service
// allocState is shared with (and eventually freed by) the service thread.
struct remAllocState* allocState;
// Thread running ncclRemoteMemAllocationService; the join in bootstrapClose is
// currently commented out.
pthread_t allocThread;
};
#define MAX_SEGMENTS 128
// Serve one remote allocation request arriving on the connected socket `fd`.
//
// Protocol: receive the requested size (size_t), allocate that many bytes of
// device memory, then send back the IPC handle followed by the raw device
// pointer.
//
// ptr : out - receives the device pointer on success. Once the allocation has
//       been made, any later failure frees it and resets *ptr to NULL, so the
//       caller never keeps a dangling or orphaned pointer in its segment table.
// fd  : connected socket to the requester; not closed here.
//
// Returns ncclSuccess, or the error of the step that failed.
static ncclResult_t remoteAlloc(void** ptr, int fd) {
  size_t size;
  NCCLCHECK(socketRecv(fd, &size, sizeof(size_t)));
  hipIpcMemHandle_t devIpc;
  NCCLCHECK(ncclCudaCalloc((char**)ptr, size, true));
  hipError_t res = hipIpcGetMemHandle(&devIpc, *ptr);
  if (res != hipSuccess) {
    WARN("[Rem Allocator] hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
    hipFree(*ptr);
    // Clear the slot: the service thread frees every non-NULL segment at
    // shutdown, which would otherwise be a double free.
    *ptr = NULL;
    // res is a failure here, so CUDACHECK is expected to return the error.
    CUDACHECK(res);
  }
  // The CUDA IPC
  ncclResult_t ret = socketSend(fd, &devIpc, sizeof(hipIpcMemHandle_t));
  // And the direct pointer
  if (ret == ncclSuccess) ret = socketSend(fd, ptr, sizeof(void*));
  if (ret != ncclSuccess) {
    // The requester never learned about this buffer; release it instead of
    // leaving it attached to a slot that has no connection.
    WARN("[Rem Allocator] Failed to send allocation to requester (fd %d)", fd);
    hipFree(*ptr);
    *ptr = NULL;
  }
  return ret;
}
#include <poll.h>
// Service thread to allocate memory for other GPUs, used as intermediate step.
//
// args is a struct remAllocState*. The thread selects state->cudaDev, then
// loops on poll():
//  - a readable listening socket means a new request: accept it, let
//    remoteAlloc() serve it, and remember the buffer and its connection in
//    the first free slot;
//  - POLLHUP on a stored connection means the requester is done: free the
//    buffer and release the slot.
// The loop ends when state->stop is neither 0 nor (1 with buffers still
// outstanding). On normal exit every remaining buffer and socket is released
// and `state` itself is freed. Always returns NULL.
void* ncclRemoteMemAllocationService(void* args) {
  struct remAllocState* state = (struct remAllocState *) args;
  if (hipSetDevice(state->cudaDev) != hipSuccess) {
    WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
  }

  // Prepare poll descriptor: one entry per segment slot, plus the listening
  // socket in the last entry.
  void* segments[MAX_SEGMENTS];
  struct pollfd pollfds[MAX_SEGMENTS+1];
  for (int s=0; s<MAX_SEGMENTS; s++) {
    segments[s] = NULL;
    pollfds[s].fd = -1;
    pollfds[s].events = POLLHUP;
  }
  pollfds[MAX_SEGMENTS].fd = state->listenFd;
  pollfds[MAX_SEGMENTS].events = POLLIN;

  int nbuffers = 0;
  while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
    // Keep the call and the comparison separate: `int e = poll(...) < 0`
    // would store the comparison result instead of poll's return value.
    int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/);
    if (error < 0) {
      WARN("[Rem Allocator] Poll failed with error %d", error);
      // state is intentionally not freed here: the owning communicator still
      // writes state->stop during close/abort.
      return NULL;
    }
    if (pollfds[MAX_SEGMENTS].revents) {
      // Find the first free slot; check the bound before reading the array.
      int s = 0;
      while (s < MAX_SEGMENTS && segments[s] != NULL) s++;
      // Accept into a local so that a full table (s == MAX_SEGMENTS) cannot
      // overwrite the listening socket's own poll entry.
      int fd = -1;
      if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &fd) == ncclSuccess) {
        if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, fd) != ncclSuccess)) {
          WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, fd);
          close(fd);
        } else {
          pollfds[s].fd = fd;
          nbuffers++;
        }
      }
    }
    for (int s=0; s<MAX_SEGMENTS; s++) {
      if (pollfds[s].revents & POLLHUP) {
        if (hipFree(segments[s]) != hipSuccess) {
          WARN("[Rem Allocator] hipFree %p failed", segments[s]);
        }
        segments[s] = NULL;
        close(pollfds[s].fd);
        pollfds[s].fd = -1;
        nbuffers--;
      }
    }
  }
  // Release whatever is still held.
  for (int s=0; s<MAX_SEGMENTS; s++) {
    if (segments[s]) hipFree(segments[s]);
    if (pollfds[s].fd >= 0) close(pollfds[s].fd);
  }
  close(state->listenFd);
  free(state);
  return NULL;
}
// Ask the allocation service of rank `rank` for `size` bytes of device memory.
//
// size      : number of bytes requested.
// rank      : index into the peer allocation-service address table.
// commState : the struct extState* produced by bootstrapInit.
// id        : out - on success, the socket that keeps the allocation alive
//             (pass it to bootstrapRemFree); -1 on failure.
// ipc       : out - IPC handle for the remote buffer.
// ptr       : out - the buffer's device pointer as seen by the remote process.
//
// Returns ncclSuccess, or the error of the step that failed. On failure the
// connection is closed here so that no descriptor leaks.
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr) {
  struct extState* state = (struct extState*)commState;
  int fd;
  ncclResult_t res;
  *id = -1;
  NCCLCHECK(connectAddress(&fd, state->peerAllocAddresses+rank));
  NCCLCHECKGOTO(socketSend(fd, &size, sizeof(size_t)), res, end);
  NCCLCHECKGOTO(socketRecv(fd, ipc, sizeof(hipIpcMemHandle_t)), res, end);
  NCCLCHECKGOTO(socketRecv(fd, ptr, sizeof(void*)), res, end);
  *id = fd;
end:
  // *id is still -1 only when one of the exchanges above failed; the socket
  // was never handed to the caller, so it must be closed here.
  if (*id == -1) close(fd);
  return res;
}
// Release a remote allocation obtained from bootstrapRemAlloc.
// id is the value bootstrapRemAlloc stored in *id, i.e. the connected socket;
// rank and commState are not used. The allocation service frees the buffer when
// it sees POLLHUP on its end of the connection - presumably triggered by this
// close; confirm against the socket semantics in use.
// NOTE(review): bootstrapRemAlloc leaves *id at -1 on failure; passing that
// here closes an invalid descriptor, and the outcome depends on SYSCHECK.
ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
SYSCHECK(close(id), "close");
return ncclSuccess;
}
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
@@ -283,19 +331,15 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
void *tmpSendComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
void* extBstrapListenCommRoot;
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
int tmpSendFd, tmpRecvFd;
// stagger connection times to avoid an overload of the root at very high rank counts
int extListenFdRoot;
memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress));
memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
// stagger connection times to avoid an overload of the root
if (nranks > 128) {
long msec = rank;
struct timespec tv;
@@ -306,25 +350,35 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
}
// send info on my listening socket to root
NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
union socketAddress* rootAddr = (union socketAddress*)id;
NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
NCCLCHECK(bootstrapNetSend(tmpSendFd, &info, sizeof(info)));
close(tmpSendFd);
// get info on my "next" rank in the bootstrap ring from root
ncclNetHandle_t extHandleNext;
NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
union socketAddress extAddressNext;
NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd));
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &extAddressNext, sizeof(extAddressNext)));
close(tmpRecvFd);
close(extListenFdRoot);
NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
NCCLCHECK(connectAddress(&state->extRingSendFd, &extAddressNext));
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd));
// AllGather all listen handlers
NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
// Create the memory allocation service
NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
NCCLCHECK(ncclCalloc(&state->allocState, 1));
CUDACHECK(hipGetDevice(&state->allocState->cudaDev));
NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@@ -348,9 +402,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
NCCLCHECK(bootstrapNetSend(state->extRingSendFd, data+sslice*size, size));
// Recv slice from the left
NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, data+rslice*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
@@ -359,20 +413,20 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
void* tmpSendComm;
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
int tmpSendFd;
NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer));
NCCLCHECK(bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int)));
NCCLCHECK(bootstrapNetSend(tmpSendFd, data, size));
close(tmpSendFd);
return ncclSuccess;
}
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
// New unex
struct unexConn* unex;
NCCLCHECK(ncclCalloc(&unex, 1));
unex->peer = peer;
unex->comm = comm;
unex->fd = fd;
// Enqueue
struct unexConn* list = state->unexpectedConnections;
@@ -385,7 +439,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
return ncclSuccess;
}
void* unexpectedDequeue(struct extState* state, int peer) {
int unexpectedDequeue(struct extState* state, int peer) {
struct unexConn* elem = state->unexpectedConnections;
struct unexConn* prev = NULL;
while (elem) {
@@ -395,41 +449,41 @@ void* unexpectedDequeue(struct extState* state, int peer) {
} else {
prev->next = elem->next;
}
void* comm = elem->comm;
int fd = elem->fd;
free(elem);
return comm;
return fd;
}
prev = elem;
elem = elem->next;
}
return NULL;
return -1;
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
void* tmpRecvComm;
int tmpRecvFd;
// Search unexpected connections first
if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) {
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
close(tmpRecvFd);
return ncclSuccess;
}
// Then look for new connections
while (1) {
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd));
int newPeer;
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int)));
if (newPeer == peer) {
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
close(tmpRecvFd);
return ncclSuccess;
}
// Unexpected connection. Save for later.
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd));
}
}
@@ -439,11 +493,17 @@ ncclResult_t bootstrapClose(void* commState) {
WARN("Unexpected connections are not empty.\n");
return ncclInternalError;
}
NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
close(state->extListenFd);
close(state->extRingSendFd);
close(state->extRingRecvFd);
free(state->peerBstrapHandles);
state->allocState->stop = 1;
// Join the allocThread so we catch resource leaks as being hung here
// pthread_join(state->allocThread, nullptr);
free(state->peerCommAddresses);
free(state->peerAllocAddresses);
free(state);
return ncclSuccess;
@@ -451,10 +511,12 @@ ncclResult_t bootstrapClose(void* commState) {
ncclResult_t bootstrapAbort(void* commState) {
struct extState* state = (struct extState*)commState;
bootstrapNetCloseListen(state->extBstrapListenComm);
bootstrapNetCloseSend(state->extBstrapRingSendComm);
bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
free(state->peerBstrapHandles);
close(state->extListenFd);
close(state->extRingSendFd);
close(state->extRingRecvFd);
state->allocState->stop = 2;
free(state->peerCommAddresses);
free(state->peerAllocAddresses);
free(state);
return ncclSuccess;
}
+2 -4
Wyświetl plik
@@ -26,16 +26,14 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
}
// Per-channel operation list.
NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
NCCLCHECK(ncclCudaHostCalloc(&channel->collectivesExtra, comm->nRanks*NCCL_MAX_OPS*4));
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (channel->id == -1) return ncclSuccess;
// Operation list
NCCLCHECK(ncclCudaHostFree(channel->collectivesExtra));
NCCLCHECK(ncclCudaHostFree(channel->collectives));
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
// Free Ring index to rank tables
free(channel->ring.userRanks);
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollAllGather, "AllGather",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
@@ -25,9 +25,10 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
} else {
struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
return ncclEnqueueCheck(&info);
//struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
// sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
// ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
//return ncclEnqueueCheck(&info);
return ncclInternalError;
}
}
@@ -37,9 +37,10 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
} else {
struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
return ncclEnqueueCheck(&info);
//struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
// sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
// ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
//return ncclEnqueueCheck(&info);
return ncclInternalError;
}
}
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
return ncclEnqueueCheck(&info);
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
IMPL_COLL_C(AllGather);
@@ -9,206 +9,201 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
prims.directSend(thisInput+chunkOffset, offset, nelem);
} else {
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
if (thisInput + chunkOffset == thisOutput + offset) { // In place
prims.directSend(thisInput+chunkOffset, offset, nelem);
} else {
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
// Final wait/copy.
prims.directRecv(thisOutput+offset, offset, nelem);
}
}
};
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
// Final wait/copy.
prims.directRecv(thisOutput+offset, offset, nelem);
}
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t chunkOffset = gridOffset + bid*chunkSize;
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
};
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
// Partial specialization of ncclFunction for AllGather with the TREE algorithm.
// PROTO is left as a template parameter, so this one specialization matches
// every protocol value. Its run() has an empty body, i.e. AllGather performs
// no work when dispatched with NCCL_ALGO_TREE.
// NOTE(review): presumably this exists only so that the TREE entries generated
// by the NCCL_FUNC4 table macro have something to refer to - confirm.
template<int PROTO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_TREE, PROTO, FUNC, T, UNROLL> {
public:
// Empty body: `args` is not read and nothing is sent or received.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Partial specialization of ncclFunction for AllGather with the COLLNET
// algorithm. PROTO is left as a template parameter, so this one specialization
// matches every protocol value. Its run() has an empty body, i.e. AllGather
// performs no work when dispatched with NCCL_ALGO_COLLNET.
// NOTE(review): presumably this exists only so that the COLLNET entries
// generated by the NCCL_FUNC4 table macro have something to refer to - confirm.
template<int PROTO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL> {
public:
// Empty body: `args` is not read and nothing is sent or received.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
IMPL_COLL_R(AllReduce);
Plik diff jest za duży Load Diff
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
IMPL_COLL_C(Broadcast);
@@ -9,177 +9,155 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws;
if (tid == 0) clk = __rtc64();
#endif
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
INIT_COUNTER;
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
} else {
INIT_COUNTER;
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
ACCUMULATE_COUNTER(copySend);
if (rank == root) {
if (thisInput == thisOutput) {
prims.send(thisInput+offset, nelem);
} else {
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
}
} else if (nextRank == root) {
prims.recv(thisOutput+offset, nelem);
} else {
prims.recvCopySend(thisOutput+offset, nelem);
}
}
} else if (nextRank == root) {
INIT_COUNTER;
prims.recv(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recv);
} else {
INIT_COUNTER;
prims.recvCopySend(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recvCopySend);
}
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST);
#endif
}
};
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
}
};
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
// Partial specialization of ncclFunction for Broadcast with the TREE algorithm.
// PROTO is left as a template parameter, so this one specialization matches
// every protocol value. Its run() has an empty body, i.e. Broadcast performs
// no work when dispatched with NCCL_ALGO_TREE.
// NOTE(review): presumably this exists only so that the TREE entries generated
// by the NCCL_FUNC4 table macro have something to refer to - confirm.
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
public:
// Empty body: `args` is not read and nothing is sent or received.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
// Partial specialization of ncclFunction for Broadcast with the COLLNET
// algorithm. PROTO is left as a template parameter, so this one specialization
// matches every protocol value. Its run() has an empty body, i.e. Broadcast
// performs no work when dispatched with NCCL_ALGO_COLLNET.
// NOTE(review): presumably this exists only so that the COLLNET entries
// generated by the NCCL_FUNC4 table macro have something to refer to - confirm.
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
public:
// Empty body: `args` is not read and nothing is sent or received.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
+226 -224
Wyświetl plik
@@ -11,122 +11,95 @@
#include "collectives.h"
#include "devcomm.h"
// Device-side timestamp helper, always inlined.
// Returns the current counter value converted to long long int:
//  - when __HIP__ is non-zero: the result of __builtin_amdgcn_s_memrealtime();
//  - otherwise: the result of __clock_u64().
// The ENABLE_PROFILING code in the collectives samples it at the start and
// end of a kernel and accumulates the difference.
// NOTE(review): the two back-ends presumably tick at different rates, so
// values should only be compared within one build - confirm.
__device__
inline __attribute((always_inline))
long long int __rtc64() {
#if __HIP__
return (long long int) __builtin_amdgcn_s_memrealtime();
#else
return (long long int) __clock_u64();
#endif
}
#define COLL_UNROLL 2
#define NCCL_MAX_DEV_ARITY NCCL_MAX_TREE_ARITY
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
// Each thread sets a predicate to true if abort == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// HIP variant. This is a statement macro, not a function:
//  1. a thread whose `abort` is non-zero atomically adds 1 to *abortCount;
//  2. every thread then waits at __syncthreads();
//  3. if LOAD(abortCount) is non-zero, `return false;` is executed IN THE
//     CALLER, so the macro is only usable inside a function whose return type
//     accepts `false`.
// The expansion is several statements and is not wrapped in do { } while (0),
// so it must not be used as the unbraced body of an if/else/loop.
// The s_endpgm instruction is commented out; the early return replaces it.
// NOTE(review): LOAD is defined elsewhere; presumably an atomic/volatile read
// of *abortCount - confirm.
#define exitIfAbortBarrier(abort, abortCount) \
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
__syncthreads(); \
if (LOAD(abortCount)) { /*asm volatile ("s_endpgm");*/ return false; }
// On this platform __syncwarp() expands to nothing.
#define __syncwarp()
#else
// CUDA variant, implemented with inline PTX. Takes only the per-thread abort
// flag and returns nothing; when any participating thread passed abort == 1
// the threads execute the PTX `exit` instruction instead of returning.
static inline __device__ void exitIfAbortBarrier(int abort) {
uint32_t popc;
// The braces open/close a PTX scope so the predicate register stays local.
asm ("{");
asm volatile (" .reg .pred barr_pred;");
// barr_pred = (abort == 1)
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
// Barrier number 13 with a popc reduction: popc receives the count of
// threads at the barrier whose predicate is true.
asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
asm ("}");
// A non-zero count means at least one thread requested abort.
if (popc) { asm volatile ("exit;"); }
}
#endif
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll, op, dtype)
// Expands to the three per-protocol table entries for one
// (func, algo, redop, type) combination, in the order LL, LL, SIMPLE.
// NOTE(review): the middle entry repeats LL; presumably that slot corresponds
// to the LL128 protocol and LL is substituted for it - confirm against the
// protocol enum before changing the order or count.
#define NCCL_FUNC5(func, algo, redop, type) \
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
// Expands NCCL_FUNC5 once per algorithm for one (func, redop, type)
// combination, in the order TREE, RING, COLLNET.
// NOTE(review): the order presumably has to match the NCCL_ALGO_* numbering
// used to index the function table - confirm.
#define NCCL_FUNC4(func, redop, type) \
NCCL_FUNC5(func, TREE, redop, type), \
NCCL_FUNC5(func, RING, redop, type), \
NCCL_FUNC5(func, COLLNET, redop, type)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64), \
NCCL_FUNC4(coll, op, b16)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Typed variant: expands NCCL_FUNC4 once per element type (ten groups) for one
// (func, redop) pair. The order of the types defines the table layout and is
// expected to stay consistent with ncclDataType_t.
#define NCCL_FUNCS3A(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, uint8_t), \
NCCL_FUNC4(func, redop, int32_t), \
NCCL_FUNC4(func, redop, uint32_t), \
NCCL_FUNC4(func, redop, int64_t), \
NCCL_FUNC4(func, redop, uint64_t), \
NCCL_FUNC4(func, redop, half), \
NCCL_FUNC4(func, redop, float), \
NCCL_FUNC4(func, redop, double), \
NCCL_FUNC4(func, redop, rccl_bfloat16)
// Untyped variant: emits the same number of NCCL_FUNC4 groups (ten) as
// NCCL_FUNCS3A, but every group uses int8_t, so all ten type slots refer to
// the same int8_t entries while the table keeps the same shape.
// Reached through NCCL_FUNCS2B, which the function table uses for Broadcast
// and AllGather.
// NOTE(review): presumably valid because those collectives only move bytes
// and never interpret element values - confirm.
#define NCCL_FUNCS3B(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// Reducing variant: expands NCCL_FUNCS3A once per reduction operator for one
// collective, in the order Sum, Prod, Max, Min. The order defines the table
// layout and is expected to stay consistent with ncclRedOp_t.
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum ), \
NCCL_FUNCS3A(func, Prod), \
NCCL_FUNCS3A(func, Max ), \
NCCL_FUNCS3A(func, Min )
// Non-reducing variant: emits four NCCL_FUNCS3B groups, the same count as
// NCCL_FUNCS2A, but every group uses Sum, so all four operator slots refer to
// the same entries while the table keeps the same shape.
// The function table uses this variant for Broadcast and AllGather.
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce), \
NCCL_COLL_NAME(ncclGather, copy, i8), \
NCCL_COLL_NAME(ncclScatter, copy, i8), \
NCCL_COLL_NAME(ncclAllToAll, copy, i8), \
NCCL_COLL_NAME(ncclAllToAllv, copy, i8), \
NCCL_COLL_NAME(ncclSendRecv, copy, i8) }
NCCL_FUNCS2B(Broadcast), \
NCCL_FUNCS2A(Reduce), \
NCCL_FUNCS2B(AllGather), \
NCCL_FUNCS2A(ReduceScatter), \
NCCL_FUNCS2A(AllReduce), \
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t) }
// Must be consistent with the ncclFuncSet enum
using ncclKernelFunc_t = void (*)(struct CollectiveArgs*);
using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);
static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if defined(__HIP_DEVICE_COMPILE__)
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce),
NCCL_COLL_NAME(ncclGather, copy, i8),
NCCL_COLL_NAME(ncclScatter, copy, i8),
NCCL_COLL_NAME(ncclAllToAll, copy, i8),
NCCL_COLL_NAME(ncclAllToAllv, copy, i8),
NCCL_COLL_NAME(ncclSendRecv, copy, i8)
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
NCCL_FUNCS2B(AllGather),
NCCL_FUNCS2A(ReduceScatter),
NCCL_FUNCS2A(AllReduce),
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
#endif
};
template<unsigned short f, unsigned short l>
struct Caller {
static __device__ __host__
void call(ncclColl* const c) noexcept
void call(struct ncclWorkElem* const c) noexcept
{
constexpr unsigned short m = f + (l - f) / 2;
@@ -137,78 +110,72 @@ struct Caller {
template<unsigned short f>
struct Caller<f, f + 1>{
static __device__ __host__
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); }
};
inline
__device__
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
if (c->funcIndex < 360) {
if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
else ncclBroadcastCollNet_copy_i8(&c->args);
if (c->funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(c);
}
else if (c->funcIndex < 720) Caller<360, 720>::call(c);
else if (c->funcIndex < 1080) {
if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
else ncclAllGatherCollNet_copy_i8(&c->args);
if (c->funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
}
else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
else if (c->funcIndex == 1800) {
ncclGather_copy_i8(&c->args);
}
else if (c->funcIndex == 1801) {
ncclScatter_copy_i8(&c->args);
}
else if (c->funcIndex == 1802) {
ncclAllToAll_copy_i8(&c->args);
}
else if (c->funcIndex == 1803) {
ncclAllToAllv_copy_i8(&c->args);
}
else ncclSendRecv_copy_i8(&c->args);
else ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
}
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
}
static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
// Check whether the last operation was aborted and make sure all threads exit
int abort = tid == 0 ? *(comm->abortFlag) : 0;
exitIfAbortBarrier(abort, abortCount);
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
__syncthreads();
if (tid == 0) hostColl->active = 0;
load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
// Check whether the last operation was aborted and make sure all threads exit
int abort = tid == 0 ? LOAD(comm->abortFlag) : 0;
exitIfAbortBarrier(abort, abortCount);
if (tid == 0) hostWork->elems[0].active = 0;
return true;
}
// Primary (fallback) template for a device-side collective implementation,
// selected by function, algorithm, protocol, reduction functor, element type
// and unroll factor. This generic version does nothing: run() has an empty
// body. Presumably the real work lives in specializations defined elsewhere
// (not visible in this part of the file) — confirm before relying on it.
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction {
public:
// No-op for combinations that have no dedicated implementation; `args` is ignored.
__device__ void run(struct ncclWorkElem* args) {}
};
#ifdef ENABLE_COLLTRACE
#define traceColl(fIdx) \
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
comm->collTrace[pos].timeStamp = __rtc64(); \
comm->collTrace[pos].opCount = localColl.args.opCount; \
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
comm->collTrace[pos].opCount = w->opCount; \
comm->collTrace[pos].bid = bid; \
comm->collTrace[pos].funcIndex = fIdx;
#define traceKernelLaunch(fIdx) { \
traceColl(fIdx); \
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (comm->collTrace[pos].data_0)); \
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
}
#define traceCollEnd(fIdx) { \
traceColl(fIdx); \
@@ -218,124 +185,159 @@ static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* ho
traceColl(fIdx); \
comm->collTrace[pos].type = ncclCollTraceAbortType; \
}
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
// Appends one record of type ncclCollTraceDataType to comm->collTrace.
// The slot is claimed by atomically incrementing *comm->collTraceTail and
// wrapping the result at COLLTRACE_NUM_ITEMS. Besides the block index and a
// timestamp, the four arguments are stored as follows:
//   data2   -> funcIndex
//   data4   -> data_0
//   data8_0 -> opCount
//   data8_1 -> data_1
// The expansion site must have a variable named `comm` in scope.
#define traceData(data2, data4, data8_0, data8_1) { \
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
comm->collTrace[pos].bid = blockIdx.x; \
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
comm->collTrace[pos].funcIndex = data2; \
comm->collTrace[pos].data_0 = data4; \
comm->collTrace[pos].opCount = data8_0; \
comm->collTrace[pos].data_1 = data8_1; \
comm->collTrace[pos].type = ncclCollTraceDataType; \
}
#else
#define traceKernelLaunch()
#define traceCollEnd()
#define traceAbort()
#define traceKernelLaunch(fIdx)
#define traceCollEnd(fIdx)
#define traceAbort(fIdx)
#define traceData(data2, data4, data8_0, data8_1)
#endif
extern __device__ volatile uint64_t* ncclShmem;
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
// Per-group scratch area kept in block-shared memory; ncclShmemData holds an
// array of NCCL_MAX_GROUPS of these.
struct ncclShmemPtrs {
// Source and destination pointer tables, NCCL_MAX_DEV_ARITY+1 slots each.
// NOTE(review): presumably these back the srcs/dsts arrays used by the
// primitives — confirm against the code that fills them.
void* srcs[NCCL_MAX_DEV_ARITY+1];
void* dsts[NCCL_MAX_DEV_ARITY+1];
// Barrier counter for the group; thread 0 resets it to 0 at the start of ncclKernel.
uint64_t barrier;
// One value per warp (MAXWARPS entries), also reset to 0 by thread 0 in ncclKernel.
uint64_t barrier_next[MAXWARPS];
};
struct ncclShmemData {
union {
#ifdef ENABLE_LL128
#define ALLOCATE_SHMEM \
__shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
ncclShmem = shmem; \
__shared__ uint32_t sync[NCCL_LL128_MAX_NTHREADS/WARP_SIZE];
volatile uint64_t data[NCCL_LL128_SHMEM_SIZE];
#else
#define ALLOCATE_SHMEM \
uint32_t* sync = 0;
volatile uint64_t* data;
#endif
struct ncclShmemPtrs ptrs[NCCL_MAX_GROUPS];
};
uint32_t sync[MAXWARPS];
struct ncclWork localWork;
};
/* Functions for aggregation case */
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
extern __device__ struct ncclShmemData *ncclShmem;
// Common body of every generated kernel: processes work elements for this
// block's channel until one of them signals the end of the launch.
//
// Template parameters:
//   FUNCTION/ALGO/PROTO/REDOP/T/UNROLL  select the ncclFunction instance `f`
//                                       that this kernel can run directly.
//   FINDEX     function index handled by `f`; any other index is dispatched
//              through NCCL_CALL_FUNCTIONS.
//   COLLTRACE  compile-time switch guarding every trace macro call below.
//
// `first` is the work element passed by value from the __global__ wrapper.
// It supplies the communicator, the starting FIFO index and, for block 0
// only, possibly the first operation to run.
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL, int FINDEX, bool COLLTRACE>
__device__ void ncclKernel(struct ncclWorkElem first) {
int tid = threadIdx.x;
int bid = blockIdx.x;
// Block-shared state; its address is published through the ncclShmem pointer.
__shared__ struct ncclShmemData shmem;
ncclShmem = &shmem;
__shared__ uint32_t abortCount;
// Thread 0 clears the abort counter and every group's barrier bookkeeping;
// the __syncthreads() below makes that visible before anyone proceeds.
if (tid == 0) {
abortCount = 0;
for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
shmem.ptrs[i].barrier = 0;
for (auto j = 0; j < MAXWARPS; j++) shmem.ptrs[i].barrier_next[j] = 0;
}
}
__syncthreads();
auto f = ncclFunction<FUNCTION, ALGO, PROTO, REDOP, T, UNROLL>();
struct ncclDevComm* comm = first.comm;
// One channel per block.
struct ncclChannel* channel = comm->channels+bid;
struct ncclWorkElem* w = NULL;
uint16_t index = first.index;
bool firstLaunch = true;
// Only block 0 uses the by-value element directly, and only when it is not
// a point-to-point operation; every other case loads from the work FIFO.
if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
while (1) {
if (w == NULL) {
// Copy the next FIFO entry into shared memory and run its first element.
w = shmem.localWork.elems;
if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) {
if (COLLTRACE && tid == 0) traceAbort(-1);
return;
}
if (COLLTRACE && tid == 0) {
// Two independent ifs rather than if/else: with tracing enabled the macros
// expand to a braced block, so the call-site ';' would detach an else.
if (firstLaunch) traceKernelLaunch(w->funcIndex);
if (!firstLaunch) traceCollEnd(w->funcIndex);
firstLaunch = false;
}
} else {
// Inlined first element (block 0 only): nothing to load, just trace the launch.
if (COLLTRACE && tid == 0) {
traceKernelLaunch(w->funcIndex);
firstLaunch = false;
}
}
// Threads beyond the element's nThreads sit this operation out.
if (tid < w->nThreads) {
if (w->funcIndex == FINDEX) {
f.run(w);
} else {
NCCL_CALL_FUNCTIONS(w);
}
}
// Advance to the next FIFO slot, wrapping at NCCL_MAX_OPS.
index = (index+1) % NCCL_MAX_OPS;
// NOTE(review): active == 2 ends the loop; presumably the host marks the
// last element of a launch this way — confirm against the enqueue code.
if (w->active == 2) {
if (COLLTRACE && tid == 0) traceCollEnd(-1);
return;
}
// Force a FIFO load on the next iteration.
w = NULL;
}
}
/* Kernels with the first operation inlined */
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
ALLOCATE_SHMEM; \
__shared__ struct ncclColl localColl; \
__shared__ uint32_t abortCount; \
__shared__ uint64_t barrier[MAXBARRIERS]; \
__shared__ uint64_t barrier_next[MAXBARRIERS*MAXWARPS]; \
if (tid == 0) abortCount = 0; \
__syncthreads(); \
\
struct ncclChannel* channel = comm->channels+bid; \
if (tid == 0) { \
channel->sync = sync; \
channel->barrier = barrier; \
channel->barrier_next = barrier_next; \
for (auto i = 0; i < MAXBARRIERS; i++) barrier[i] = 0; \
for (auto i = 0; i < MAXBARRIERS*MAXWARPS; i++) barrier_next[i] = 0; \
} \
if (!load_coll(&localColl, channel->collectives+channel->collFifoHead, tid, comm, &abortCount)) { \
if (tid == 0) traceAbort(-1); \
return; \
} \
if (tid == 0) traceKernelLaunch(localColl.funcIndex); \
while (1) { \
if (tid < localColl.args.common.nThreads) { \
if (localColl.funcIndex == fIndex) { \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&localColl.args); \
} else { \
NCCL_CALL_FUNCTIONS(&localColl); \
} \
} \
int nextIndex = localColl.nextIndex; \
if (tid == 0) channel->collFifoHead = nextIndex; \
\
if (localColl.active == 2) { \
if (tid == 0) traceCollEnd(-1); \
return; \
} \
\
/* Load next collective operation*/ \
if (!load_coll(&localColl, channel->collectives+nextIndex, tid, comm, &abortCount)) { \
if (tid == 0) traceAbort(-1); \
break; \
} \
if (tid == 0) traceCollEnd(localColl.funcIndex); \
} \
__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first) { \
if (first.comm->collTraceThread) \
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, true>(first); \
else \
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, false>(first); \
}
#define IMPL_COLL_KERN_sum(coll, op, ncclFunc, dtype, ctype, fIndex) \
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_copy(coll, op, ncclFunc, dtype, ctype, fIndex) \
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_prod(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_min(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_max(coll, op, ncclFunc, dtype, ctype, fIndex)
// Examples : AllReduce, RING, LL, Sum, uint8
/* Functions for aggregation case */
// Defines one non-inlined __device__ function, named by
// NCCL_FUNC_NAME(func, algo, proto, redop, type), that takes a work element,
// builds ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto,
// Func##redop<type>, type, COLL_UNROLL> and calls its run() on that element.
// The arguments are pasted into identifiers, so they must be bare tokens
// (e.g. AllReduce, RING, LL, Sum, int8_t).
#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \
__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args) { \
auto f = ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL>(); \
f.run(args); \
}
// Only generate inline kernels for LL
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
#define IMPL_COLL4(func, algo, redop, type, ncclType) \
IMPL_COLL_FUNC(func, algo, LL, redop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type) \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
#define IMPL_COLL3(func, redop, type, ncclType) \
IMPL_COLL4(func, TREE, redop, type, ncclType) \
IMPL_COLL4(func, RING, redop, type, ncclType) \
IMPL_COLL4(func, COLLNET, redop, type, ncclType)
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) \
IMPL_COLL3(coll, op, ncclFunc, b16, rccl_bfloat16, ncclColl, ncclOp, ncclBfloat16)
#define IMPL_COLL2(func, redop) \
IMPL_COLL3(func, redop, int8_t, ncclInt8) \
IMPL_COLL3(func, redop, uint8_t, ncclUint8) \
IMPL_COLL3(func, redop, int32_t, ncclInt32) \
IMPL_COLL3(func, redop, uint32_t, ncclUint32) \
IMPL_COLL3(func, redop, int64_t, ncclInt64) \
IMPL_COLL3(func, redop, uint64_t, ncclUint64) \
IMPL_COLL3(func, redop, half, ncclFloat16) \
IMPL_COLL3(func, redop, float, ncclFloat32) \
IMPL_COLL3(func, redop, double, ncclFloat64) \
IMPL_COLL3(func, redop, rccl_bfloat16, ncclBfloat16)
// Reduction define all functions
#define IMPL_COLL_R(collf, colln) \
IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); \
IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); \
IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); \
IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
#define IMPL_COLL_R(func) \
IMPL_COLL2(func, Sum) \
IMPL_COLL2(func, Prod) \
IMPL_COLL2(func, Min) \
IMPL_COLL2(func, Max)
// Copy primitives only define one
#define IMPL_COLL_C(collf, colln) \
IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
// Copy primitives only define one function for copy
#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
#define COLL_UNROLL 2
// Point-to-point primitives only have one function/kernel.
#define IMPL_COLL_P(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
#endif
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -282,28 +282,57 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
#endif
}
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
const int offset, const int N) {
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
T val = vFetch(srcs[0]+idx);
#pragma unroll
for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
#pragma unroll 1
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem) {
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
const T* srcs[MAXSRCS];
for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
T* dsts[MAXDSTS];
for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;
while (offset < Nelem) {
T vals[UNROLL];
// Load and reduce
for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);
#pragma unroll
for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
for (int i=1; i<MINSRCS; i++) {
T vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
}
#pragma unroll
for (int i=MINSRCS; i<MAXSRCS; i++) {
if (i<nsrcs) {
T vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
}
}
// Store
#pragma unroll
for (int i = 0; i < MINDSTS; i++) {
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
}
#pragma unroll
for (int i=MINDSTS; i<MAXDSTS; i++) {
if (i<ndsts) {
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
}
}
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
offset += inc;
}
}
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
const int elemOffset, const int Npack) {
__device__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
@@ -334,8 +363,10 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
for (int i=MINDSTS; i<MAXDSTS; i++) {
if (i<ndsts) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
}
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
@@ -343,85 +374,73 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
}
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
template <typename T>
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
#else
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
#define PACKELEMS (sizeof(Pack128) / sizeof(T))
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// Multiply UNROLL by 2 if single source/single destination
#define AUTOUNROLL (UNROLL*((MINSRCS==1 && MINDSTS==1) ? 2 : 1))
#endif
// Try to limit consecutive load/stores to 8.
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
int nsrcs, const T** srcs, int ndsts, T** dsts,
int N) {
int Nrem = N;
if (Nrem <= 0) return;
int alignDiff = 0;
int align = ptrAlign128(srcs[0]);
#pragma unroll
for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
#pragma unroll
for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
int Npreamble = alignDiff ? Nrem :
N < alignof(int32_t) ? N :
(alignof(int32_t) - align) % alignof(int32_t);
#else
int Npreamble = alignDiff ? Nrem :
N < alignof(Pack128) ? N :
(alignof(Pack128) - align) % alignof(Pack128);
#endif
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
if (Npreamble) {
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
Nrem -= Npreamble;
if (Nrem == 0) return;
}
int offset = Npreamble;
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
int w = tid / WARP_SIZE; // Warp number
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
const int packFactor = sizeof(Pack128) / sizeof(T);
// Check that all is 16B aligned. If not don't use 16B load/stores.
int align = 0;
#pragma unroll
for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
#pragma unroll
for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);
// stage 2a: main loop
int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
* (AUTOUNROLL * WARP_SIZE); // round down
int Nelem2a = Npack2a * packFactor;
int offset = 0;
if (align == 0) {
// fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit aligned.
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
// main loop
int Npack = (Nrem / (PACKELEMS*AUTOUNROLL*WARP_SIZE)) * (AUTOUNROLL*WARP_SIZE); // round down
int Nelem = Npack * PACKELEMS;
Nrem -= Nelem2a;
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
Nrem -= Nelem;
if (Nrem == 0) return;
offset += Nelem;
// slightly less optimized for section when we don't have full unrolling
Npack = Nrem / PACKELEMS;
Nelem = Npack * PACKELEMS;
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
Nrem -= Nelem;
if (Nrem == 0) return;
offset += Nelem;
}
// unrolled, by-type (mostly for unaligned buffers)
int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down
ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);
Nrem -= Nelem;
if (Nrem == 0) return;
offset += Nelem2a;
offset += Nelem;
// stage 2b: slightly less optimized for section when we don't have full
// unrolling
int Npack2b = Nrem / packFactor;
int Nelem2b = Npack2b * packFactor;
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
Nrem -= Nelem2b;
if (Nrem == 0) return;
offset += Nelem2b;
// stage 2c: tail
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
// no unroll, by type. Should finish what's remaining.
ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
#endif // COMMON_KERNEL_H_
@@ -9,62 +9,62 @@
#include "collectives.h"
#include "common.h"
__device__ volatile uint64_t* ncclShmem;
__device__ struct ncclShmemData* ncclShmem;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll##LL128, op, dtype), \
NCCL_COLL_NAME(coll, op, dtype)
#define NCCL_FUNC5(func, algo, redop, type) \
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
NCCL_FUNC_NAME(func, algo, LL128, redop, type), \
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
#define NCCL_FUNC4(func, redop, type) \
NCCL_FUNC5(func, TREE, redop, type), \
NCCL_FUNC5(func, RING, redop, type), \
NCCL_FUNC5(func, COLLNET, redop, type)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
#define NCCL_FUNCS3A(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, uint8_t), \
NCCL_FUNC4(func, redop, int32_t), \
NCCL_FUNC4(func, redop, uint32_t), \
NCCL_FUNC4(func, redop, int64_t), \
NCCL_FUNC4(func, redop, uint64_t), \
NCCL_FUNC4(func, redop, half), \
NCCL_FUNC4(func, redop, float), \
NCCL_FUNC4(func, redop, double)
#define NCCL_FUNCS3B(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum ), \
NCCL_FUNCS3A(func, Prod), \
NCCL_FUNCS3A(func, Max ), \
NCCL_FUNCS3A(func, Min )
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce) }
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\
NCCL_FUNCS2B(Broadcast), \
NCCL_FUNCS2A(Reduce), \
NCCL_FUNCS2B(AllGather), \
NCCL_FUNCS2A(ReduceScatter), \
NCCL_FUNCS2A(AllReduce) }
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
@@ -72,12 +72,12 @@ __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCC
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if __CUDA_ARCH__
NCCL_COLL_NAME(ncclSendRecv, copy, i8),
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
NCCL_FUNCS2B(AllGather),
NCCL_FUNCS2A(ReduceScatter),
NCCL_FUNCS2A(AllReduce)
#endif
};
#endif
@@ -9,29 +9,9 @@
#define OP128_H_
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// Reads the two consecutive 64-bit words at ptr[0] and ptr[1] through LOAD,
// returning them in v0 and v1 respectively (HIP build: two 64-bit accesses).
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
  const uint64_t* upper = ptr + 1;
  v0 = LOAD(ptr);
  v1 = LOAD(upper);
}
// Writes v0 to ptr[0] and then v1 to ptr[1] through STORE
// (HIP build: two 64-bit accesses).
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
  uint64_t* upper = ptr + 1;
  STORE(ptr, v0);
  STORE(upper, v1);
}
// Returns the same address with the volatile qualifier removed; no address
// conversion is performed in the HIP build.
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
  uint64_t* plain = const_cast<uint64_t*>(shmemGenericPtr);
  return plain;
}
// Reads the two consecutive 64-bit words at shmemAsmPtr[0] and shmemAsmPtr[1]
// through LOAD, returning them in v0 and v1 respectively.
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
  uint64_t* upper = shmemAsmPtr + 1;
  v0 = LOAD(shmemAsmPtr);
  v1 = LOAD(upper);
}
// Writes v0 to shmemAsmPtr[0] and then v1 to shmemAsmPtr[1] through STORE.
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
  uint64_t* upper = shmemAsmPtr + 1;
  STORE(shmemAsmPtr, v0);
  STORE(upper, v1);
}
#else
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
@@ -32,87 +32,79 @@
} \
} while (0)
#define barrier_by_id(id) do { \
#define barrier_by_group() do { \
const int w = threadIdx.x/WARP_SIZE; \
barrier_next[id*MAXWARPS+w] += nthreads/WARP_SIZE; \
__atomic_fetch_add(barriers+id, 1, __ATOMIC_SEQ_CST); \
while (LOAD(barriers+id) < barrier_next[id*MAXWARPS+w]) /* spin */; \
const int wid = threadIdx.x%WARP_SIZE; \
if (wid == 0) { \
barrier_next[w] += nthreads/WARP_SIZE; \
__atomic_fetch_add(barriers, 1, __ATOMIC_SEQ_CST); \
while (LOAD(barriers) < barrier_next[w]) /* spin */; \
} \
} while (0)
#define ROLE_SRC 0x01
#define ROLE_DST 0x02
#define ROLE_WAIT_RECV 0x04
#define ROLE_WAIT_SEND 0x08
#define ROLE_POST_SEND 0x10
#define ROLE_POST_RECV 0x20
// Implementation of primitive types
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
class ncclPrimitives {
private:
const int tid;
const int nthreads;
const int wid;
int nthreads;
int nworkers;
const int stepSize;
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
volatile uint64_t* recvConnTailPtr = NULL;
uint64_t recvConnTail;
uint64_t recvConnTailCache; // Cache last seen value
struct ncclConnInfo* conn = NULL;
volatile int* connSizesFifoPtr = NULL;
void** connPtrsFifoPtr = NULL;
volatile uint64_t* connHeadPtr = NULL;
volatile uint64_t* connTailPtr = NULL;
uint64_t connTailCache; // Cache last seen value
uint64_t connHeadCache; // Cache last seen value
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnTailPtr = NULL;
uint64_t sendConnTail;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
#if defined(RCCL_USE_DIRECT_BUFFER)
const T* recvDirectBuff[NRECV];
T* sendDirectBuff[NSEND];
#endif
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
int index; // Peer index I'm responsible for
int peer = -1;
int role = 0;
int group;
uint64_t step;
T* direct = NULL;
T* buff;
struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
const T** srcs;
T** dsts;
uint64_t* barriers;
uint64_t* barrier_next;
// Don't use barrier 0 as it's used by the final sync
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (wid == 0) {
if (NRECV < NSEND) barrier_by_id(0);
else barrier_by_id(1);
}
if (nthreads == WARP_SIZE) __syncwarp();
else barrier_by_group();
#else
if (NSEND>NRECV) {
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
} else {
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
}
if (nthreads == WARP_SIZE) __syncwarp();
else asm volatile ("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
#endif
}
inline __device__ void subBarrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
__syncthreads();
barrier();
#else
if (NSEND>NRECV) {
asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
} else {
asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
}
if (nworkers == nthreads) barrier();
else asm volatile ("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
#endif
}
uint32_t spins = 0;
uint32_t abort = 0;
inline __device__ int checkAbort(int i, int send) {
inline __device__ int checkAbort() {
spins++;
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
abort = LOAD(comm->abortFlag);
@@ -121,90 +113,54 @@ class ncclPrimitives {
return abort;
}
inline __device__ void waitSend(int nbytes) {
spins = 0;
if (sendConnHeadPtr) {
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
sendConnHeadCache = LOAD(sendConnHeadPtr);
if (checkAbort(wid, 1)) break;
}
if (sendConnFifoPtr) {
STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, nbytes);
}
sendConnHead += SLICESTEPS;
}
// Selects the data pointer for the current step.
// When DIRECTPTR is non-zero and a direct buffer is set, returns
// direct+directOffset; otherwise returns the slot of `buff` that corresponds
// to step%NCCL_STEPS (each slot being stepSize elements wide).
template <int DIRECTPTR>
inline __device__ T* directPtr(ssize_t directOffset) {
  if (DIRECTPTR && direct) {
    return direct + directOffset;
  }
  return buff + (step%NCCL_STEPS)*stepSize;
}
inline __device__ void waitRecv() {
template <int DST, int DIRECTSEND>
inline __device__ void waitSend(ssize_t directOffset, int nbytes) {
spins = 0;
if (recvConnTailPtr) {
#ifdef ENABLE_PROFILING
uint64_t t0 = __rtc64();
#endif
while (recvConnTailCache < recvConnTail + SLICESTEPS) {
recvConnTailCache = LOAD(recvConnTailPtr);
if (checkAbort(wid, 0)) break;
}
#ifdef ENABLE_PROFILING
__atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
#endif
recvConnTail += SLICESTEPS;
while (connHeadCache + NCCL_STEPS < step + SLICESTEPS) {
connHeadCache = LOAD(connHeadPtr);
if (checkAbort()) break;
}
if (connSizesFifoPtr) {
STORE(connSizesFifoPtr+step%NCCL_STEPS, nbytes);
}
if (connPtrsFifoPtr) dsts[DST+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
step += SLICESTEPS;
}
inline __device__ void incRecv(int i) {
recvStep[i] += SLICESTEPS;
// Waits until the receive connection's tail has advanced far enough for the
// next slice, then records where this peer's data can be read from.
//   SRC         - offset added to `index` when storing into srcs[]
//   DIRECTRECV  - forwarded to directPtr<>() to allow the direct buffer path
//   directOffset - element offset passed to directPtr<>()
// On return, srcs[SRC+index] is set and `step` has advanced by SLICESTEPS.
template <int SRC, int DIRECTRECV>
inline __device__ void waitRecv(ssize_t directOffset) {
// Reset the spin counter used by checkAbort().
spins = 0;
#ifdef ENABLE_PROFILING
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
#endif
// Spin, refreshing the cached tail from connTailPtr, until it reaches
// step + SLICESTEPS or checkAbort() reports a non-zero value.
while (connTailCache < step + SLICESTEPS) {
connTailCache = LOAD(connTailPtr);
if (checkAbort()) break;
}
#ifdef ENABLE_PROFILING
// Only thread 0 adds the elapsed time to this block's wait_recv_cycle counter.
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
#endif
// If a pointer FIFO is set, take the source pointer from its slot for this
// step; otherwise fall back to directPtr<>() (direct buffer or staging buffer).
if (connPtrsFifoPtr) srcs[SRC+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
step += SLICESTEPS;
}
inline __device__ void postRecv() {
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += SLICESTEPS);
STORE(connHeadPtr, step += SLICESTEPS);
}
inline __device__ void incSend(int i) {
sendStep[i] += SLICESTEPS;
}
inline __device__ void postSend() {
if (sendConnTailPtr) {
if (sendConn->next_hdp_reg) STORE(sendConn->next_hdp_reg, 0x1);
STORE(sendConnTailPtr, sendConnTail += SLICESTEPS);
}
if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
STORE(connTailPtr, step += SLICESTEPS);
}
// Returns the pointer to read received data for peer slot i.
// With RCCL_USE_DIRECT_BUFFER defined, DIRECTRECV non-zero and a direct
// buffer registered for slot i, that is recvDirectBuff[i]+directOffset;
// in every other case it is recvPtr(i).
template <int DIRECTRECV>
inline __device__ const T* directRecvPtr(int i, ssize_t directOffset) {
#if defined(RCCL_USE_DIRECT_BUFFER)
  if (DIRECTRECV && recvDirectBuff[i]) {
    return recvDirectBuff[i] + directOffset;
  }
#endif
  return recvPtr(i);
}
// Returns the pointer to write outgoing data for peer slot i.
// With RCCL_USE_DIRECT_BUFFER defined, DIRECTSEND non-zero and a direct
// buffer registered for slot i, that is sendDirectBuff[i]+directOffset;
// in every other case it is sendPtr(i).
template <int DIRECTSEND>
inline __device__ T* directSendPtr(int i, ssize_t directOffset) {
#if defined(RCCL_USE_DIRECT_BUFFER)
  if (DIRECTSEND && sendDirectBuff[i]) {
    return sendDirectBuff[i] + directOffset;
  }
#endif
  return sendPtr(i);
}
// Picks the per-slice pointer increment for receive slot i: directInc when
// RCCL_USE_DIRECT_BUFFER is defined, DIRECTRECV is non-zero and slot i has a
// direct buffer; sliceInc otherwise.
template <int DIRECTRECV>
inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
#if defined(RCCL_USE_DIRECT_BUFFER)
  if (DIRECTRECV && recvDirectBuff[i]) {
    return directInc;
  }
#endif
  return sliceInc;
}
// Picks the per-slice pointer increment for send slot i: directInc when
// RCCL_USE_DIRECT_BUFFER is defined, DIRECTSEND is non-zero and slot i has a
// direct buffer; sliceInc otherwise.
template <int DIRECTSEND>
inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
#if defined(RCCL_USE_DIRECT_BUFFER)
  if (DIRECTSEND && sendDirectBuff[i]) {
    return directInc;
  }
#endif
  return sliceInc;
}
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
inline __device__ void
GenericOp(const T* srcPtr, T* dstPtr, int nelem, ssize_t directOffset) {
@@ -212,148 +168,126 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
int sliceSize = stepSize*SLICESTEPS;
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
const T* srcs[RECV*NRECV+SRC];
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
if (RECV) {
if (SRC) srcs[1] = recvPtr(0);
for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
}
T* dsts[SEND*NSEND+DST];
dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
if (SEND) {
if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
}
#pragma unroll
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
int realSize = max(0, min(dataSize, nelem-offset));
#ifdef ENABLE_PROFILING
uint64_t t0 = __rtc64();
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
#endif
if (SEND) waitSend(realSize*sizeof(T));
if (RECV) waitRecv();
if (realSize > 0) {
barrier();
if (tid < nworkers) {
if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<SRC, DIRECTRECV>(directOffset+offset);
if (DST && (role & ROLE_DST)) dsts[0] = dstPtr+offset;
if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
if (realSize > 0) {
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
#endif
#if defined(RCCL_USE_DIRECT_BUFFER)
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
}
} else {
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
subBarrier();
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
#else
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
#endif
}
barrier();
FOR_SEND(incSend);
FOR_RECV(incRecv);
if (tid >= nthreads-WARP_SIZE) {
if (SEND) {
if (realSize > 0 && wid == 0) __threadfence_system();
__syncwarp();
postSend();
}
if (RECV) postRecv();
}
srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
__syncwarp();
if (SEND && (role & ROLE_POST_SEND)) postSend();
if (RECV && (role & ROLE_POST_RECV)) postRecv();
offset += realSize;
}
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
recvBuff[i] = (const T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
recvStep[i] = LOAD(&conn->step);
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
#if defined(RCCL_USE_DIRECT_BUFFER)
recvDirectBuff[i] = NULL;
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
recvDirectBuff[i] = directBuff;
if (tid == 0) STORE(conn->ptrExchange, directBuff);
}
#endif
if (wid == i) recvConn = conn;
if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
nrecv++;
}
__device__ __forceinline__ void loadRecvSync() {
if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
recvConnTailPtr = LOAD(&recvConn->tail);
recvConnTailCache = LOAD(recvConnTailPtr);
}
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConnHeadPtr = LOAD(&recvConn->head);
// Return credits in case we rounded up.
STORE(recvConnHeadPtr, recvConnHead);
// Attaches this thread to the receive connection of `peer` on `channel`.
// Threads with neither ROLE_WAIT_RECV nor ROLE_POST_RECV return immediately.
// directBuff is currently unused: the direct-receive setup is commented out.
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
  const int recvRoles = ROLE_WAIT_RECV|ROLE_POST_RECV;
  if ((role & recvRoles) == 0) return;

  conn = &channel->devPeers[peer].recv.conn;
  // Start from the connection's saved step, rounded up with ROUNDUP to
  // SLICESPERCHUNK*SLICESTEPS. The read and the rounding stay separate
  // statements so conn->step is read exactly once.
  step = conn->step;
  step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);

  if ((role & ROLE_POST_RECV) != 0) {
    connHeadPtr = conn->head;
    // Return credits in case we rounded up.
    STORE(connHeadPtr, step);
  }

  if ((role & ROLE_WAIT_RECV) != 0) {
    buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
    //if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
    //  direct = directBuff;
    //  *conn->ptrExchange = directBuff;
    //}
    connTailPtr = conn->tail;
    // Seed the cached tail so waitRecv() can test it before the first reload.
    connTailCache = LOAD(connTailPtr);
    connPtrsFifoPtr = conn->ptrsFifo;
  }
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendBuff[i] = (T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
sendStep[i] = LOAD(&conn->step);
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
#if defined(RCCL_USE_DIRECT_BUFFER)
sendDirectBuff[i] = NULL;
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
void* volatile* ptr = LOAD(&conn->ptrExchange);
while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
barrier();
if (tid == 0) STORE(ptr, NULL);
}
#endif
if (wid == i) sendConn = conn;
if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
nsend++;
}
__device__ __forceinline__ void loadSendSync() {
if (tid < nsend) {
sendConnHeadPtr = LOAD(&sendConn->head);
sendConnHeadCache = LOAD(sendConnHeadPtr);
sendConnFifoPtr = LOAD(&sendConn->fifo);
}
if (tid >= nthreads-WARP_SIZE && wid < nsend) {
sendConnTailPtr = LOAD(&sendConn->tail);
// Attaches this thread to the send connection of `peer` on `channel`.
// Threads with neither ROLE_WAIT_SEND nor ROLE_POST_SEND return immediately.
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
  const int sendRoles = ROLE_WAIT_SEND|ROLE_POST_SEND;
  if ((role & sendRoles) == 0) return;

  conn = &channel->devPeers[peer].send.conn;
  // Start from the connection's saved step, rounded up with ROUNDUP to
  // SLICESPERCHUNK*SLICESTEPS. The read and the rounding stay separate
  // statements so conn->step is read exactly once.
  step = conn->step;
  step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);

  if ((role & ROLE_POST_SEND) != 0) {
    connTailPtr = conn->tail;
  }

  if ((role & ROLE_WAIT_SEND) != 0) {
    buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
    //if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
    //  void* volatile* ptr = conn->ptrExchange;
    //  while ((direct = (T*)(*ptr)) == NULL);
    //  *ptr = NULL;
    //}
    connHeadPtr = conn->head;
    // Seed the cached head so waitSend() can test it before the first reload.
    connHeadCache = LOAD(connHeadPtr);
    connSizesFifoPtr = conn->sizesFifo;
    connPtrsFifoPtr = conn->ptrsFifo;
  }
}
__device__ __forceinline__ void saveRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
STORE(&recvConn->step, recvConnHead);
__threadfence_system();
}
}
__device__ __forceinline__ void saveSendSync() {
if (tid < nsend) {
STORE(&sendConn->step, sendConnHead);
// Writes this thread's step counter back to its connection so the next
// operation can resume from it. Only threads holding ROLE_POST_SEND or
// ROLE_POST_RECV do this; the store is followed by __threadfence_system().
__device__ __forceinline__ void saveSync() {
  const int postRoles = ROLE_POST_SEND|ROLE_POST_RECV;
  if ((role & postRoles) == 0) return;

  conn->step = step;
  __threadfence_system();
}
public:
__device__ __forceinline__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize) {
barriers = channel->barrier;
barrier_next = channel->barrier_next;
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next) {
nthreads = nworkers;
// For send operations, we need an extra warp to overlap the threadfence and the copy
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
// nthreads += postThreads;
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
loadRecvSync();
loadSendSync();
for (int i=0; i<NRECV; i++) if (recvPeers[i] != -1) nrecv++;
for (int i=0; i<NSEND; i++) if (sendPeers[i] != -1) nsend++;
#define SYNC_GROUP 8
static_assert(NSEND < SYNC_GROUP && NRECV < SYNC_GROUP, "Not enough threads to cover all peers");
int g = tid / SYNC_GROUP;
int ng = nthreads / SYNC_GROUP;
index = tid % SYNC_GROUP;
if (g == 0) {
if (index < nrecv) role |= ROLE_WAIT_RECV;
if (index == nrecv) role |= ROLE_SRC;
} else if (g == 1) {
if (index < nsend) role |= ROLE_WAIT_SEND;
if (index == nsend) role |= ROLE_DST;
} else if (g == ng - 2) {
if (index < nrecv) role |= ROLE_POST_RECV;
} else if (g == ng - 1) {
if (index < nsend) role |= ROLE_POST_SEND;
}
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) peer = recvPeers[index];
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) peer = sendPeers[index];
loadRecvConn(channel, directBuff);
loadSendConn(channel);
}
__device__ __forceinline__ void
@@ -414,8 +348,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
__device__ __forceinline__ ~ncclPrimitives() {
// Save steps for the next operation
saveRecvSync();
saveSendSync();
saveSync();
}
};
@@ -424,10 +357,10 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
#ifdef ENABLE_PROFILING
#define INIT_COUNTER \
if (tid == 0) { t0 = __rtc64(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
#define ACCUMULATE_COUNTER(prim) \
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __rtc64() - t0 \
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __builtin_amdgcn_s_memrealtime() - t0 \
+ ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
#else
@@ -205,7 +205,7 @@ class ncclLLPrimitives {
sendConnHeadPtr = LOAD(&sendConn->head);
sendConnHeadCache = LOAD(sendConnHeadPtr);
sendConnHead = LOAD(&sendConn->step);
sendConnFifoPtr = LOAD(&sendConn->fifo);
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
}
}
@@ -118,9 +118,14 @@ class ncclLL128Primitives {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
if (u*WARP_SIZE < maxOffset) {
uint64_t v0, v1;
load128(src64Ptr+u*WARP_SIZE, v0, v1);
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
//load128(src64Ptr+u*WARP_SIZE, v0, v1);
asm volatile ("flat_load_dwordx4 %0, %1\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(src64Ptr+u*WARP_SIZE));
//storeShmem128(shmemAsmPtr+u*WARP_SIZE, i2[0], i2[1]);
*(shmemAsmPtr+u*WARP_SIZE) = i2[0];
*(shmemAsmPtr+u*WARP_SIZE+1) = i2[1];
}
}
#endif
@@ -135,15 +140,24 @@ class ncclLL128Primitives {
template <int ELEMS_PER_THREAD>
inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
uint64_t v[ELEMS_PER_THREAD];
using Velem = uint64_t __attribute__((ext_vector_type(ELEMS_PER_THREAD)));
Velem v;
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
v[u] = *(shmemAsmPtr+u*WARP_SIZE);
v[u+1] = *(shmemAsmPtr+u*WARP_SIZE+1);
//loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
}
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
//if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
i2[0] = v[u];
i2[1] = v[u+1];//
if (u*WARP_SIZE < maxOffset) asm volatile ("flat_store_dwordx4 %0, %1\n"
"s_waitcnt vmcnt(0)\n" : : "v"(dst64Ptr+u*WARP_SIZE), "v"(i2));
}
}
@@ -176,45 +190,52 @@ class ncclLL128Primitives {
uint64_t flag = recvFlag(0);
uint64_t* ptr = recvPtr(0)+ll128Offset;
bool needReload;
uint64_t v0, v1;
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
do {
if (wid == 0) STORE(sync, 0);
needReload = false;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (v1 != flag);
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (i2[1] != flag);
}
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
if (LOAD(sync) == 0) break;
} while (checkAbort(0, 0) == 0);
} while (LOAD(sync) && checkAbort(0, 0) == 0);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = SRC ? MULTI<FUNC, T>()(i2[0], v[u]) : i2[0];
v[u+1] = SRC ? MULTI<FUNC, T>()(i2[1], v[u+1]) : i2[1];
}
for (int i=1; i<NRECV && i<nrecv; i++) {
uint64_t flag = recvFlag(i);
uint64_t* ptr = recvPtr(i)+ll128Offset;
uint64_t v0, v1;
Vec i2;
do {
if (wid == 0) STORE(sync, 0);
needReload = false;
needReload = 0;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (v1 != flag);
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (i2[1] != flag);
}
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
if (LOAD(sync) == 0) break;
} while (checkAbort(i, 0) == 0);
} while (LOAD(sync) && checkAbort(i, 0) == 0);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = MULTI<FUNC, T>()(v0, v[u]);
v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = MULTI<FUNC, T>()(i2[0], v[u]);
v[u+1] = MULTI<FUNC, T>()(i2[1], v[u+1]);
}
}
}
@@ -223,18 +244,30 @@ class ncclLL128Primitives {
/************************ Send **************************/
if (SEND) {
for (int i=1; i<NSEND && i<nsend; i++) {
int flag = sendFlag(i);
uint64_t flag = sendFlag(i);
uint64_t* ptr = sendPtr(i)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
i2[0] = v[u];
i2[1] = flagThread ? flag : v[u+1];//
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
}
}
int flag = sendFlag(0);
uint64_t flag = sendFlag(0);
uint64_t* ptr = sendPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
i2[0] = v[u];
i2[1] = flagThread ? flag : v[u+1];//
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
}
}
/********************** End Send ************************/
@@ -279,7 +312,7 @@ class ncclLL128Primitives {
const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
if (SRC) {
int done = 0;
if ((((uint64_t)srcPtr)&0xf) == 0) {
if ((((uint64_t)srcPtr)&0x3) == 0) {
loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
}
@@ -290,7 +323,7 @@ class ncclLL128Primitives {
__syncwarp();
if (DST) {
int done = 0;
if ((((uint64_t)dstPtr)&0xf) == 0) {
if ((((uint64_t)dstPtr)&0x3) == 0) {
storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
}
@@ -330,10 +363,10 @@ class ncclLL128Primitives {
sendConnHeadPtr = LOAD(&sendConn->head);
sendConnHeadCache = LOAD(sendConnHeadPtr);
sendConnHead = LOAD(&sendConn->step);
sendConnFifoPtr = LOAD(&sendConn->fifo);
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
}
if (tid >= nthreads-WARP_SIZE && wid<nsend) {
if (sendConn->fifo) {
if (sendConn->sizesFifo) {
sendConnTailPtr = LOAD(&sendConn->tail);
sendConnTail = LOAD(&sendConn->step);
}
@@ -357,12 +390,7 @@ class ncclLL128Primitives {
public:
__device__ __forceinline__
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
// for __any_sync
if (NSEND > NRECV)
sync = channel->sync + 2 + tid/WARP_SIZE;
else
sync = channel->sync + tid/WARP_SIZE;
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem->data+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid), sync(ncclShmem->sync+warp) {
// Make sure step is updated before we read it.
barrier();
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(ncclReduce, ncclCollReduce);
IMPL_COLL_R(Reduce);
+123 -129
Wyświetl plik
@@ -9,151 +9,145 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (prevRank == root) {
prims.send(thisInput+offset, nelem);
} else if (rank == root) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (prevRank == root) {
prims.send(thisInput+offset, nelem);
} else if (rank == root) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
}
}
}
};
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
}
}
};
// Placeholder device function with an empty body: calling it performs no work
// and `args` is never read.
// NOTE(review): going by the name, this is the Tree-algorithm / LL128-protocol
// slot for Reduce, presumably present only so the symbol exists — confirm.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
// Partial specialization of ncclFunction chosen when the function is
// ncclFuncReduce and the algorithm is NCCL_ALGO_TREE, for any PROTO, REDOP, T
// and UNROLL.
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
public:
// Empty body: invoking run() does nothing and `args` is never read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
// Placeholder device function with an empty body: calling it performs no work
// and `args` is never read.
// NOTE(review): going by the name, this is the CollNet-algorithm /
// LL128-protocol slot for Reduce, presumably present only so the symbol
// exists — confirm.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
// Partial specialization of ncclFunction chosen when the function is
// ncclFuncReduce and the algorithm is NCCL_ALGO_COLLNET, for any PROTO, REDOP,
// T and UNROLL.
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
public:
// Empty body: invoking run() does nothing and `args` is never read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
IMPL_COLL_R(ReduceScatter);
@@ -9,195 +9,189 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
prims.send(thisInput+offset, nelem);
prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
prims.recvReduceSend(thisInput+offset, nelem);
prims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final result
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
};
// step k-1: reduce this buffer and data, which will produce the final result
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
LLprims.send(thisInput+offset, nelem);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
// Placeholder device function with an empty body: calling it performs no work
// and `args` is never read.
// NOTE(review): going by the name, this is the Tree-algorithm / LL-protocol
// slot for ReduceScatter, presumably present only so the symbol exists —
// confirm.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
// Placeholder device function with an empty body: calling it performs no work
// and `args` is never read.
// NOTE(review): going by the name, this is the CollNet-algorithm / LL-protocol
// slot for ReduceScatter, presumably present only so the symbol exists —
// confirm.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t chunkOffset = gridOffset + bid*chunkSize;
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
LLprims.send(thisInput+offset, nelem);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceSend(thisInput+offset, nelem);
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
};
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
// Partial specialization of ncclFunction chosen when the function is
// ncclFuncReduceScatter and the algorithm is NCCL_ALGO_TREE, for any PROTO,
// REDOP, T and UNROLL.
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
public:
// Empty body: invoking run() does nothing and `args` is never read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
// Placeholder device function with an empty body: calling it performs no work
// and `args` is never read.
// NOTE(review): going by the name, this is the Tree-algorithm / LL128-protocol
// slot for ReduceScatter, presumably present only so the symbol exists —
// confirm.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
// Placeholder device function with an empty body: calling it performs no work
// and `args` is never read.
// NOTE(review): going by the name, this is the CollNet-algorithm /
// LL128-protocol slot for ReduceScatter, presumably present only so the symbol
// exists — confirm.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
// Partial specialization of ncclFunction chosen when the function is
// ncclFuncReduceScatter and the algorithm is NCCL_ALGO_COLLNET, for any PROTO,
// REDOP, T and UNROLL.
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
public:
// Empty body: invoking run() does nothing and `args` is never read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,5 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
IMPL_COLL_P(SendRecv);
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,74 +8,85 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->p2p.nThreads;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* firstArgs) {
struct ncclWorkElem* args = firstArgs;
int tid = threadIdx.x;
int group = 0;
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
int nThreadsSegment = args->p2p.nThreads;
if (nThreadsSegment == 0) return; // Nothing else to do
int groupRecv = group;
group += 1;
int groupSend = group;
group += 1;
if (tid < nThreadsSegment) {
const int nThreads = nThreadsSegment;
// Compute pointers
const T* sendbuff = (const T*)args->sendbuff;
T* recvbuff = (T*)args->recvbuff;
// Compute pointers
const T* sendbuff = (const T*)args->sendbuff;
T* recvbuff = (T*)args->recvbuff;
const ssize_t sendCount = args->p2p.sendCount;
const ssize_t recvCount = args->p2p.recvCount;
if (args->p2p.delta < 0 ) return; // No-op
const int delta = args->p2p.delta;
if (delta == 0) {
if (tid < nThreads && sendbuff != recvbuff) {
// local copy : ReduceOrCopyMulti takes an int as number of elements,
// so we split it in blocks of 1G elements.
int blockSize = 1<<30;
for (size_t offset=0; offset<sendCount; offset += blockSize) {
size_t remaining = sendCount - offset;
if (remaining < blockSize) blockSize = remaining;
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
sendbuff += blockSize; recvbuff += blockSize;
}
}
} else {
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
if (args->p2p.delta == 0) {
if (tid < nthreads && sendbuff != recvbuff) {
// local copy : ReduceOrCopyMulti takes an int as number of elements,
// so we split it in blocks of 1G elements.
int blockSize = 1<<30;
for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) {
size_t remaining = args->p2p.sendCount - offset;
if (remaining < blockSize) blockSize = remaining;
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize);
sendbuff += blockSize; recvbuff += blockSize;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
int nThreadsSplit = nThreads/2;
if ((tid < nThreadsSplit) && recvCount >= 0) {
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
int nt = nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
if (recvCount == 0) {
prims.recv(recvbuff, 0);
} else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
int realChunkSize = min(chunkSize, recvCount-offset);
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, recvCount-offset);
prims.directRecv(recvbuff+offset, offset, nelem);
}
}
if ((tid >= nThreadsSplit) && sendCount >= 0) {
int peer = (comm->rank+delta)%comm->nRanks;
int nt = nThreads-nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
if (sendCount == 0) {
prims.send(sendbuff, 0);
} else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
int realChunkSize = min(chunkSize, sendCount-offset);
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, sendCount-offset);
prims.directSend(sendbuff+offset, offset, nelem);
}
}
}
}
tid -= nThreadsSegment;
if (tid < 0) return;
args++;
}
}
return;
}
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR;
int nthreadsSplit = nthreads/2;
// We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and
// receive threads, but then we define all peers to -1 since sender threads don't receive and
// receive threads don't send.
int peerNone[2] = {-1,-1};
if (tid < nthreadsSplit ) {
const ssize_t sendSize = args->p2p.sendCount;
if (sendSize < 0) return;
int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks;
ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC>
prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
if (sendSize == 0) {
prims.send(sendbuff, 0);
} else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) {
int realChunkSize = min(stepSize, sendSize-offset);
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, sendSize-offset);
prims.directSend(sendbuff+offset, offset, nelem);
}
} else {
const ssize_t recvSize = args->p2p.recvCount;
if (recvSize < 0) return;
int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks;
ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
if (recvSize == 0) {
prims.recv(recvbuff, 0);
} else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) {
int realChunkSize = min(stepSize, recvSize-offset);
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, recvSize-offset);
prims.directRecv(recvbuff+offset, offset, nelem);
}
}
}
};
@@ -29,9 +29,10 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
return ncclSuccess;
}
else {
struct ncclInfo info = { ncclCollGather, "Gather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
//struct ncclInfo info = { ncclCollGather, "Gather",
// sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
// GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
//return ncclEnqueueCheck(&info);
return ncclInternalError;
}
}
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollReduce, "Reduce",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
@@ -29,9 +29,10 @@ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
return ncclSuccess;
}
else {
struct ncclInfo info = { ncclCollScatter, "Scatter",
sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
//struct ncclInfo info = { ncclCollScatter, "Scatter",
// sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
// SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
//return ncclEnqueueCheck(&info);
return ncclInternalError;
}
}
@@ -13,7 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollSendRecv, "Send",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncSendRecv, "Send",
sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
@@ -27,7 +28,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollSendRecv, "Recv",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncSendRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
+3 -3
Wyświetl plik
@@ -128,7 +128,7 @@ void ncclDebugInit() {
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
if (ncclDebugLevel < level) return;
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
// Gather the rank information. This can take > 1us so we want to make sure
// we only do it when needed.
@@ -145,11 +145,11 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
if (level == NCCL_LOG_WARN)
len = snprintf(buffer, sizeof(buffer),
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
else if (level == NCCL_LOG_INFO)
len = snprintf(buffer, sizeof(buffer),
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
#ifdef ENABLE_TRACE
else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer),
+253 -213
Wyświetl plik
@@ -8,59 +8,58 @@
#include "enqueue.h"
#include "argcheck.h"
#include "coll_net.h"
#include "../graph/topo.h"
// Only generate inline kernels for LL
// NCCL_FUNC5 expands to three identical NCCL_KERN_NAME entries, every one of
// them naming the LL variant.
// NOTE(review): presumably one table slot per protocol, all pointed at the LL
// kernel — confirm against the NCCL_PROTO_* ordering.
// Form taking (coll, op, dtype): the LL suffix is token-pasted onto `coll`.
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_KERN_NAME(coll##LL, op, dtype), \
NCCL_KERN_NAME(coll##LL, op, dtype), \
NCCL_KERN_NAME(coll##LL, op, dtype)
// Form taking (func, algo, redop, dtype): the algorithm is a separate argument
// and LL is passed to NCCL_KERN_NAME as an explicit protocol token.
#define NCCL_FUNC5(func, algo, redop, dtype) \
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
NCCL_KERN_NAME(func, algo, LL, redop, dtype)
// NCCL_FUNC4 lists NCCL_FUNC5 once per algorithm, in the order Tree, Ring,
// CollNet.
// Form taking (coll, op, dtype): the algorithm name is token-pasted onto `coll`.
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
// Form taking (func, redop, type): the algorithm is forwarded to NCCL_FUNC5 as
// a separate TREE / RING / COLLNET token, in the same order as above.
#define NCCL_FUNC4(func, redop, type) \
NCCL_FUNC5(func, TREE, redop, type), \
NCCL_FUNC5(func, RING, redop, type), \
NCCL_FUNC5(func, COLLNET, redop, type)
// Must be consistent with ncclDataType_t
// NCCL_FUNCS3A expands NCCL_FUNC4 once per data type, ten entries in total;
// the order of the entries is what has to match ncclDataType_t.
// Form taking (coll, op): data types are given as short suffixes (i8 ... b16).
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64), \
NCCL_FUNC4(coll, op, b16)
// NCCL_FUNCS3B emits the same number of entries (ten) as NCCL_FUNCS3A, but
// every entry uses the i8 type.
// NOTE(review): presumably so that any data-type index resolves to the same
// byte-wise kernel for functions that do not depend on the element type —
// confirm.
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Form taking (func, redop): same ten-entry layout, with data types spelled as
// C++ type names (int8_t ... rccl_bfloat16) instead of short suffixes.
#define NCCL_FUNCS3A(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, uint8_t), \
NCCL_FUNC4(func, redop, int32_t), \
NCCL_FUNC4(func, redop, uint32_t), \
NCCL_FUNC4(func, redop, int64_t), \
NCCL_FUNC4(func, redop, uint64_t), \
NCCL_FUNC4(func, redop, half), \
NCCL_FUNC4(func, redop, float), \
NCCL_FUNC4(func, redop, double), \
NCCL_FUNC4(func, redop, rccl_bfloat16)
// Form taking (func, redop): ten entries, every one using int8_t.
#define NCCL_FUNCS3B(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t)
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum)
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum), \
NCCL_FUNCS3A(func, Sum), \
NCCL_FUNCS3A(func, Sum), \
NCCL_FUNCS3A(func, Sum)
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
typedef void(*ncclKern_t)(struct ncclDevComm*);
typedef void(*ncclKern_t)(struct ncclWorkElem first);
// Must be consistent with the ncclFuncSet enum
static ncclKern_t const ncclKerns[1] = {
NCCL_KERN_NAME(ncclSendRecv, copy, i8)
NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
};
/*****************************************************************************/
@@ -70,12 +69,8 @@ static ncclKern_t const ncclKerns[1] = {
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
if (cgMode & 0x01) {
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
// These flags are to reduce the latency of using this API
#if __HIP__
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
#else
0));
#endif
// These flags are to reduce the latency of using this API
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
return ncclSuccess;
}
int savedDev;
@@ -83,44 +78,62 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
for (int i = 0; i < numDevices; i++) {
hipLaunchParams* params = paramsList+i;
CUDACHECK(hipSetDevice(cudaDevs[i]));
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
}
CUDACHECK(hipSetDevice(savedDev));
return ncclSuccess;
}
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
// Reserve the next slot of a channel's work FIFO and mark it active.
//
// channel : channel whose workFifo receives the new operation.
// work    : optional out-parameter; when non-NULL it receives a pointer to
//           the reserved ncclWork slot.
// base    : optional template; when non-NULL it is copied into the first
//           work element of the slot.
//
// Returns ncclInvalidUsage (after a WARN) when the channel already holds
// NCCL_MAX_OPS pending operations, ncclSuccess otherwise.
static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) {
if (channel->workCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
return ncclInvalidUsage;
}
// The FIFO is used as a ring: the tail counter is reduced modulo its capacity.
int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
struct ncclWorkElem* e = w->elems;
// Spin (yielding the CPU) until the first element of the slot reads as
// inactive. NOTE(review): presumably the flag is cleared by whoever consumes
// the slot -- confirm against the device-side code.
volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
while (LOAD(activePtr) != 0) sched_yield();
// Start from a fully zeroed slot so fields from a previous use do not leak.
memset(w, 0, sizeof(struct ncclWork));
// Initialize with work elem if provided
if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
// Set active only after the contents are written; this also overrides any
// 'active' value copied in from base.
STORE(&e->active, 1);
e->index = opIndex;
channel->workFifoTail++;
channel->workCount++;
if (work) *work = w;
return ncclSuccess;
}
static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
// Only launch blocks where we have work to do.
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
if (comm->channels[c].collCount) params->gridDim.x = c+1;
for (int c=0; c<comm->p2pnChannels; c++) {
if (comm->channels[c].workCount) params->gridDim.x = c+1;
}
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<params->gridDim.x; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->collCount == 0) {
int opIndex = channel->collFifoTail;
struct ncclColl* c = channel->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (activePtr[0] != 0) sched_yield();
c->args.p2p.delta = -1; // no-op
c->funcIndex = FUNC_INDEX_P2P;
c->args.comm = comm->devComm;
c->active = 1;
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
channel->collFifoTail = opIndex;
channel->collCount++;
if (channel->workCount == 0) {
struct ncclWork* w;
NCCLCHECK(getNextOp(channel, &w, NULL));
struct ncclWorkElem* e = w->elems;
e->comm = comm->devComm;
e->funcIndex = FUNC_INDEX_P2P;
e->p2p.nThreads = 0;
}
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
}
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
struct ncclChannel* c0 = comm->channels;
struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
struct ncclWorkElem* elem = work->elems;
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
// As we inline the first coll directly, we can free it immediately.
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
comm->args = comm->devComm;
params->func = (void *)ncclKerns[0];
return ncclSuccess;
}
@@ -131,7 +144,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
bool done = false;
while (done == false) {
if (val >= comm->intraRanks) {
WARN("Trying to launch too many collectives");
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
return ncclInvalidUsage;
}
if (val+1 == comm->intraRanks) {
@@ -151,7 +164,7 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
int val = LOAD(ptr);
if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
WARN("Trying to launch too many collectives");
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
return ncclInternalError;
}
return ncclSuccess;
@@ -212,7 +225,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
if (comm->launchMode == ncclComm::PARALLEL) {
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
} else {
NCCLCHECK(ncclCpuBarrierOut(comm));
}
@@ -222,13 +235,18 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
// launch and the ncclProxyStart call could cause a deadlock.
// Also, starting the proxies after the CUDA launch seems to be better for
// performance (latency).
uint64_t max = 0ULL;
for (int r=0; r<params->gridDim.x; r++) {
struct ncclChannel* channel = comm->channels+r;
channel->collStart = channel->collFifoTail;
channel->collCount = 0;
max = std::max(max, channel->workFifoTail);
channel->workCount = 0;
}
for (int r=0; r<comm->p2pnChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
channel->workFifoTail = max;
}
params->gridDim.x = params->blockDim.x = 0;
comm->lastOpCount = comm->opCount;
comm->lastOpCount = max;
NCCLCHECK(ncclProxyStart(comm));
return ncclSuccess;
}
@@ -273,10 +291,6 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
}
}
}
if (info->coll == ncclCollAllToAll || info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv) {
info->algorithm = NCCL_ALGO_RING;
info->protocol = NCCL_PROTO_SIMPLE;
}
if (info->algorithm == -1 || info->protocol == -1) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
@@ -284,16 +298,12 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
if (info->comm->topo->type == RCCL_TOPO_4P2H_ROME && (info->coll == ncclCollAllToAll ||
info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv))
nc = 2;
int nc = (info->nChannels > 0) ? info->nChannels :
(info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
while (info->nBytes < nc*nt*threadThreshold) {
// do not reduce channels in case of alltoall
if (info->algorithm != NCCL_ALGO_COLLNET && info->coll != ncclCollAllToAll &&
info->coll != ncclCollGather && info->coll != ncclCollScatter && info->coll != ncclCollAllToAllv && nc >= 2) nc--;
if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// do not reduce threads count on VEGA
#else
@@ -303,7 +313,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
#endif
info->nChannels = nc;
info->nThreads = nt;
@@ -312,20 +323,15 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
switch (info->coll) {
case ncclCollBroadcast:
case ncclFuncBroadcast:
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
case ncclCollReduce:
case ncclFuncReduce:
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
case ncclCollReduceScatter:
case ncclCollAllGather:
case ncclFuncReduceScatter:
case ncclFuncAllGather:
info->pattern = ncclPatternRing; break;
case ncclCollAllReduce:
case ncclFuncAllReduce:
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
case ncclCollGather:
case ncclCollScatter:
case ncclCollAllToAll:
case ncclCollAllToAllv:
info->pattern = ncclPatternAll; break;
default:
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
return ncclInternalError;
@@ -342,8 +348,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternPipelineTo:
case ncclPatternCollTreeUp:
case ncclPatternCollTreeDown:
case ncclPatternAll:
info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternRingTwice:
@@ -355,41 +360,23 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
return ncclSuccess;
}
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
coll->args.sendbuff = info->sendbuff;
coll->args.recvbuff = info->recvbuff;
coll->args.comm = info->comm->devComm;
coll->args.opCount = info->comm->opCount;
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
work->comm = info->comm->devComm;
if (info->coll == ncclCollSendRecv) {
coll->args.p2p.sendCount = info->sendbytes;
coll->args.p2p.recvCount = info->recvbytes;
coll->args.p2p.delta = info->delta;
coll->funcIndex = FUNC_INDEX_P2P;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
#else
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
#endif
return ncclSuccess;
}
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getAlgoInfo(info));
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
if (info->coll == ncclCollAllToAllv) {
coll->args.a2av.count = info->count;
coll->args.a2av.nChannels = info->nChannels;
coll->args.a2av.nThreads = info->nThreads;
} else {
coll->args.coll.root = info->root;
coll->args.coll.count = info->count;
coll->args.coll.nChannels = info->nChannels;
coll->args.coll.nThreads = info->nThreads;
}
work->opCount = info->comm->opCount;
work->sendbuff = info->sendbuff;
work->recvbuff = info->recvbuff;
work->coll.root = info->root;
work->coll.count = info->count;
work->coll.nChannels = info->nChannels;
work->nThreads = info->nThreads;
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@@ -400,25 +387,25 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
if (info->pattern == ncclPatternTreeUpDown) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
int nNodes = info->comm->nNodes;
float ppn = info->comm->nRanks / (float)nNodes;
@@ -426,7 +413,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
}
// Compute nSteps for proxies
@@ -434,20 +421,20 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
int nLoops;
if (info->pattern != ncclPatternAll)
nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
else
nLoops = (int)(DIVUP(info->nBytes, (((size_t)((info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1))))*info->comm->nRanks*info->nchunksPerLoop*chunkEffectiveSize));
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->sliceSteps = sliceSteps;
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->protocol = info->protocol;
proxyArgs->opCount = info->comm->opCount;
proxyArgs->dtype = info->datatype;
proxyArgs->redOp = info->op;
if (info->coll != ncclCollAllToAllv) TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkEffectiveSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
// because some protocols need to transmit more than the total size, plus they sometimes
// round up
proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
return ncclSuccess;
}
@@ -464,32 +451,26 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
}
ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
if (info->comm->nRanks == 1) {
if (info->sendbuff != info->recvbuff)
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
return ncclSuccess;
}
struct ncclColl coll;
struct ncclWorkElem work;
struct ncclProxyArgs proxyArgs;
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
NCCLCHECK(computeColl(info, &work, &proxyArgs));
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
int nChannels = work.coll.nChannels;
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
info->comm->myParams->gridDim.x % info->comm->nChannels;
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
struct ncclChannel* channel = info->comm->channels+channelId;
if (channel->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
return ncclInvalidUsage;
}
// Proxy
proxyArgs.channel = channel;
// Adjust pattern for CollNet based on channel index
@@ -497,77 +478,143 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
}
if (info->coll == ncclCollSendRecv) {
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
NCCLCHECK(ncclProxySaveP2p(info, channel));
} else if (info->coll == ncclCollAllToAll || info->coll == ncclCollScatter || info->coll == ncclCollGather || info->coll == ncclCollAllToAllv) {
NCCLCHECK(ncclProxySaveA2a(&proxyArgs, info));
} else {
NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
}
if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
info->comm->myParams->gridDim.x++;
int opIndex = channel->collFifoTail;
struct ncclColl* c = channel->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (LOAD(activePtr) != 0) sched_yield();
memcpy(c, &coll, sizeof(struct ncclColl));
if (info->coll == ncclCollAllToAllv) {
c->args.a2av.extra = channel->collectivesExtra + info->comm->nRanks*4*opIndex;
memcpy(c->args.a2av.extra, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
memcpy(c->args.a2av.extra+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
memcpy(c->args.a2av.extra+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
memcpy(c->args.a2av.extra+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
c->args.a2av.bid = bid % coll.args.coll.nChannels;
} else if (info->coll != ncclCollSendRecv)
c->args.coll.bid = bid % coll.args.coll.nChannels;
STORE(&c->active, 1);
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
channel->collFifoTail = opIndex;
channel->collCount++;
work.coll.bid = bid % nChannels;
NCCLCHECK(getNextOp(channel, NULL, &work));
}
info->comm->opCount++;
return ncclSuccess;
}
// Save p2p operations in comm->p2plist. Operations will be posted to channels
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
// Turn every collective queued in comm->asyncOps into kernel work via
// ncclSaveKernel(), then clear the queue.
//
// A single queued operation is saved with nChannels = 0. With several queued
// operations, each one is given a channel count derived from its byte size
// and a per-channel size target.
//
// Returns ncclSuccess, or the first error reported by ncclSaveKernel(); in
// the error case the queue counters are left untouched.
ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
  // Nothing queued: nothing to do, and nothing to reset.
  if (comm->asyncOpCount == 0) return ncclSuccess;

  if (comm->asyncOpCount == 1) {
    // Single operation: no aggregation.
    struct ncclInfo* only = comm->asyncOps;
    only->nChannels = 0;
    NCCLCHECK(ncclSaveKernel(only));
  } else {
    // Several operations: aggregate them.
    // Start from the ideal per-channel size scaled by the number of ranks,
    // then halve it while the total payload cannot fill all channels, without
    // going below the minimum.
    size_t perChannelBytes = NCCL_AGG_CHANNEL_SIZE * comm->nRanks;
    while (perChannelBytes > NCCL_MIN_CHANNEL_SIZE &&
           comm->asyncTotalSize < perChannelBytes * comm->nChannels) {
      perChannelBytes /= 2;
    }

    for (int i = 0; i < comm->asyncOpCount; i++) {
      struct ncclInfo* op = &comm->asyncOps[i];
      // One channel per perChannelBytes of payload, capped at comm->nChannels.
      const int wanted = (int)DIVUP(op->nBytes, perChannelBytes);
      op->nChannels = std::min(wanted, comm->nChannels);
      NCCLCHECK(ncclSaveKernel(op));
    }
  }

  // Everything was saved: empty the queue.
  comm->asyncTotalSize = 0;
  comm->asyncOpCount = 0;
  return ncclSuccess;
}
// Append a copy of *info to the communicator's queue of pending collectives
// (comm->asyncOps) and add its byte size to the running total.
//
// Returns ncclInvalidUsage (after a WARN) when the queue already holds
// NCCL_MAX_OPS entries, ncclSuccess otherwise.
static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
  ncclComm_t comm = info->comm;

  if (comm->asyncOpCount >= NCCL_MAX_OPS) {
    WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS);
    return ncclInvalidUsage;
  }

  // Copy into the first free entry, then account for it.
  struct ncclInfo* slot = &comm->asyncOps[comm->asyncOpCount];
  memcpy(slot, info, sizeof(struct ncclInfo));
  comm->asyncTotalSize += info->nBytes;
  comm->asyncOpCount++;
  return ncclSuccess;
}
// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels
// during ncclGroupEnd()
ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
struct ncclP2Plist* p2plist = &comm->p2plist;
int peer = info->root;
p2plist->count++;
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
if (info->recvbuff == NULL) {
if (info->opName[0] == 'S') { // Send
if (peer != comm->rank) {
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].send.connected == 0) {
p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
comm->connectSend[peer] |= (1<<channelId);
comm->connect = 1;
}
}
}
p2plist->peerlist[info->root].sendbytes = nBytes;
p2plist->peerlist[info->root].sendbuff = info->sendbuff;
NCCLCHECK(enqueueP2pInfo(comm->p2pSends+info->root, (void*)info->sendbuff, nBytes));
comm->p2pSendCount++;
} else {
if (peer != comm->rank) {
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].recv.connected == 0) {
p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
comm->connectRecv[peer] |= (1<<channelId);
comm->connect = 1;
}
}
}
p2plist->peerlist[info->root].recvbytes = nBytes;
p2plist->peerlist[info->root].recvbuff = info->recvbuff;
NCCLCHECK(enqueueP2pInfo(comm->p2pRecvs+info->root, info->recvbuff, nBytes));
comm->p2pRecvCount++;
}
return ncclSuccess;
}
static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != info->delta; s++) {
if (work->elems[s].p2p.nThreads == 0) return s;
}
return -1;
}
// Fill segment `s` of `work` with the p2p operation described by `info`, then
// re-balance the per-segment thread count across segments 0..s.
//
// info : operation description; info->nThreads is overwritten with
//        NCCL_MAX_NTHREADS.
// work : work item that owns the segment.
// s    : index of the segment to fill.
//
// Always returns ncclSuccess.
static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
  struct ncclWorkElem* slot = &work->elems[s];

  // Header fields of the element.
  info->nThreads = NCCL_MAX_NTHREADS;
  slot->nThreads = info->nThreads;
  slot->funcIndex = FUNC_INDEX_P2P;
  slot->comm = info->comm->devComm;
  slot->opCount = info->comm->lastOpCount;
  slot->sendbuff = info->sendbuff;
  slot->recvbuff = info->recvbuff;

  // P2P-specific fields.
  slot->p2p.delta = info->delta;
  slot->p2p.sendCount = info->sendbytes;
  slot->p2p.recvCount = info->recvbytes;

  // Halve the per-segment thread count, starting from 512, until the segments
  // in use together need at most 256 threads.
  const int segmentsInUse = s + 1;
  int threadsPerSegment;
  for (threadsPerSegment = 512; segmentsInUse * threadsPerSegment > 256; threadsPerSegment /= 2) {
  }

  // Apply the same thread count to every segment in use.
  for (int i = s; i >= 0; i--) {
    work->elems[i].p2p.nThreads = threadsPerSegment;
  }
  return ncclSuccess;
}
// Queue one p2p operation on the channel selected by info->channelId.
//
// The operation is packed into the most recently queued work item when that
// item is a p2p item, its last segment is unused, and getSegment() finds a
// slot for it. Otherwise a fresh work item is reserved with getNextOp() and
// segment 0 is used. The launch grid and block dimensions are then grown to
// cover this channel and thread count.
//
// Returns ncclSuccess, or the first error from getNextOp(),
// ncclProxySaveP2p() or saveP2pOp().
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
  const int channelId = info->channelId;
  struct ncclChannel* channel = &info->comm->channels[channelId];

  // Most recently queued slot of the ring (one before the tail).
  int lastIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
  struct ncclWork* w = &channel->workFifo[lastIndex];

  // Reuse is only possible if something is queued, it is a p2p item, and its
  // final segment has not been taken yet.
  const bool canPack = channel->workCount
      && w->elems[0].funcIndex == FUNC_INDEX_P2P
      && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0;
  int segment = canPack ? getSegment(info, w) : -1;

  if (segment == -1) {
    // No reusable segment: start a new work item.
    NCCLCHECK(getNextOp(channel, &w, NULL));
    segment = 0;
  }

  NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
  NCCLCHECK(saveP2pOp(info, w, segment));

  // saveP2pOp() has set info->nThreads; make the launch cover it.
  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
  info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
  return ncclSuccess;
}
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
// Launch asynchronously if needed
if (ncclAsyncMode()) {
@@ -585,19 +632,17 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
NCCLCHECKGOTO(checkSetStream(info), ret, end);
if (info->coll == ncclCollAllToAllv)
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
else
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
if (info->coll == ncclCollSendRecv) { //p2p stored separately
if (info->coll == ncclFuncSendRecv) { //p2p stored separately
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->lastOpCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
} else {
NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
}
end:
if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
@@ -608,12 +653,7 @@ end:
NCCLCHECK(ArgsCheck(info));
NCCLCHECK(checkSetStream(info));
if (info->coll == ncclCollAllToAllv)
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
else
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+75 -100
Wyświetl plik
@@ -25,14 +25,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->ring.prev = channel->ring.next = -1;
channel->treeUp.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
channel->treeDn.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
channel->collTreeUp.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
channel->collTreeDn.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
channel->tree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
channel->collTree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
@@ -46,33 +42,21 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
}
if (treeIntra[i] == rank) {
int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
int parentIndex = 0;
int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
// Tree loop always flows in the same direction. Other trees are symmetric, i.e.
// up/down go in reverse directions
int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
// Down tree is common
topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
topoRanks->treeDnSend[c] = treeIntra[sendIndex];
channel->treeDn.up = treeIntra[prev];
channel->treeDn.down[0] = treeIntra[next];
// Up tree depends on the pattern
topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
topoRanks->treeToChild1[c] = treeIntra[child1Index];
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
}
if (collNetIntra[i] == rank) {
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
// CollTrees are always symmetric, i.e.
// up/down go in reverse directions
channel->collTreeDn.up = collNetIntra[prev];
channel->collTreeDn.down[0] = collNetIntra[next];
channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
channel->collTreeUp.up = channel->collTreeDn.up;
channel->collTree.up = collNetIntra[prev];
channel->collTree.down[0] = collNetIntra[next];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
@@ -122,72 +106,66 @@ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstR
return ncclSuccess;
}
static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
if (u0 != -1) tree0->up = indexes[u0];
if (u1 != -1) tree1->up = indexes[u1];
static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
if (u == -1) return ncclSuccess;
tree->up = indexes[u];
return ncclSuccess;
}
static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
if (d == -1) return ncclSuccess;
int x = 0;
if (down[x] >= 0) x++;
if (down[x] >= 0) {
WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
if (x == NCCL_MAX_TREE_ARITY) {
WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
return ncclInternalError;
}
if (r0 != -1) down[x++] = indexes[r0];
if (r1 != -1) down[x++] = indexes[r1];
tree->down[x] = indexes[d];
return ncclSuccess;
}
static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1));
NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1));
return ncclSuccess;
}
// Cuts the links between `tree` and `upRank`:
//  - if the first child slot (down[0]) currently holds upRank, it is reset to -1;
//  - if `rank` is itself upRank, the parent (up) is reset to -1.
// Other child slots are left untouched. Always returns ncclSuccess.
static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
  int* firstChild = &tree->down[0];
  if (*firstChild == upRank) {
    *firstChild = -1;
  }
  if (upRank == rank) {
    tree->up = -1;
  }
  return ncclSuccess;
}
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
int* indexesSend, *indexesRecv;
NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
int* ranksToParent, *ranksToChild0, *ranksToChild1;
NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
int u0, d0_0, d0_1, u1, d1_0, d1_1;
NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
int root = indexesSend[node];
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
channel0->treeUp.depth = channel1->treeUp.depth = depth;
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
if (comm->rank == ranksToParent[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
}
if (comm->rank == ranksToChild0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
}
if (comm->rank == ranksToChild1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
}
if (comm->rank == ranksToParent[node] ||
comm->rank == ranksToChild0[node] ||
comm->rank == ranksToChild1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
free(indexesSend);
free(indexesRecv);
free(ranksToParent);
free(ranksToChild0);
free(ranksToChild1);
return ncclSuccess;
}
@@ -200,13 +178,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclChannel* channel = comm->channels+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
channel->collTree.up = nranks;
}
if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
channel->collTree.down[0] = -1;
}
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
}
int recvIndex = 0; // recv GPU index is always 0
int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
@@ -214,13 +192,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
channel->collTree.up = nranks;
}
if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
channel->collTree.down[0] = -1;
}
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
}
return ncclSuccess;
}
@@ -255,35 +233,33 @@ int ncclMaxNchannels() {
return maxNchannels;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
int nranks = comm->nRanks;
int nChannels = comm->nChannels;
NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
for (int i=0; i<nranks; i++) {
for (int c=0; c<nChannels;c++) {
ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
// Duplicate ringPrev/ringNext for ncclBuildRing
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -317,10 +293,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
free(ringSend);
free(ringPrev);
free(ringNext);
free(treeUpRecv);
free(treeUpSend);
free(treeDnRecv);
free(treeDnSend);
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
return ncclSuccess;
}
+27 -29
Wyświetl plik
@@ -166,24 +166,20 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? 0 : link->type;
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Differentiate between one and multiple PCI switches
if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
// Ignore Power CPU in an NVLink path
if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
// Set 1 hop NVLink as NVB
if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
// Disallow GPUs as intermediate steps for now
if (remNode->type != GPU) {
int i;
for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
}
int i;
for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
}
}
}
@@ -303,7 +299,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
if (l == -1) {
char* str = getenv(levelEnv);
if (str) {
for (int i=0; i<PATH_NET; i++) {
for (int i=0; i<=PATH_SYS; i++) {
if (strcmp(str, topoPathTypeStr[i]) == 0) {
l = i;
break;
@@ -325,9 +321,10 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
}
int ncclTopoUserP2pLevel = -1;
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
*p2p = 0;
*read = 0;
if (read) *read = 0;
if (intermediateRank) *intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
@@ -337,7 +334,16 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
// GPU not found, we can't use p2p.
return ncclSuccess;
}
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
if (path->count == 2) {
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
if (intermediateNode->type == GPU && intermediateRank) {
*intermediateRank = intermediateNode->gpu.rank;
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
@@ -366,7 +372,7 @@ compare:
if (path->type == PATH_NVL) {
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
// Enable P2P Read for Ampere/NVLink only
if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
}
return ncclSuccess;
@@ -456,8 +462,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Update path when we don't want to / can't use GPU Direct P2P
for (int p=0; p<system->nodes[GPU].count; p++) {
int p2p, read;
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
int p2p;
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
if (p2p == 0) {
// Divert all traffic through the CPU
int cpu;
@@ -565,8 +571,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
// Local rank
path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
if (path->type == PATH_NVL) {
int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
*nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
} else {
*nChannels = 2;
@@ -600,16 +605,9 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
}
}
if (comm->topo->type == RCCL_TOPO_4P2H_ROME) {
// Adjust P2P channels on Rome
comm->p2pnChannelsPerPeer = 2;
comm->p2pnChannels = 2;
}
else {
// Round to next pow2 nChannelsPerPeer and nChannels
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
}
// Round to next pow2 nChannelsPerPeer and nChannels
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
// Init channels that weren't used so far
for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
+1 -1
Wyświetl plik
@@ -21,7 +21,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for (int r=0; r<nrings; r++) {
char prefix[30];
char prefix[40];
/*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
+47 -18
Wyświetl plik
@@ -25,9 +25,18 @@ static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu
}
return maxWidth;
}
// Returns the larger of two values computed over the links attached to `gpu`:
//  - the sum of the widths of all LINK_NVL links, and
//  - the width of its LINK_PCI link (if several are listed, the last one wins).
// Links of any other type are ignored. `system` is not read by this function.
static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
  float nvlSum = 0.0, pci = 0.0;
  const int nLinks = gpu->nlinks;
  for (int i = 0; i < nLinks; i++) {
    struct ncclTopoLink* cur = &gpu->links[i];
    switch (cur->type) {
      case LINK_NVL:
        // NVLink bandwidth accumulates across links.
        nvlSum += cur->width;
        break;
      case LINK_PCI:
        // PCI bandwidth is taken as-is, not accumulated.
        pci = cur->width;
        break;
      default:
        break;
    }
  }
  return (pci < nvlSum) ? nvlSum : pci;
}
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
system->maxWidth = 0.0;
system->type = 0;
system->totalWidth = 0.0;
int inter = system->nodes[NET].count;
if (inter == 0 && system->nodes[GPU].count == 1) {
system->maxWidth = LOC_WIDTH;
@@ -36,6 +45,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu));
}
return ncclSuccess;
}
@@ -293,7 +303,6 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
const uint64_t flag = 1ULL<<(graph->nChannels);
struct ncclTopoNode* gpu;
NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
if (gpu) {
gpu->used ^= flag;
@@ -352,11 +361,26 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
// Balanced Tree : count half of the bandwidth on first two GPUs
int nextBackToNet = -1;
float speedInterSave = graph->speedInter;
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
// Count half of the bandwidth on each of the first two GPUs
if (step == 0) nextBackToNet = 1;
else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
graph->speedInter /= 2;
}
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
graph->speedInter = speedInterSave;
if (net) {
graph->inter[graph->nChannels*2+1] = net->id;
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
graph->speedInter = speedInterSave;
}
}
}
@@ -493,13 +517,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
if (system->nodes[NET].count) {
if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
else *backToNet = 1;
if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
else *backToFirstRank = -1;
else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1;
else *backToNet = 0;
*backToFirstRank = -1;
} else {
*backToNet = -1;
if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1;
else *backToFirstRank = -1;
}
return ncclSuccess;
@@ -544,7 +567,7 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
/* User defined graph from XML file */
/************************************/
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter+2*c;
@@ -1062,7 +1085,7 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#else
float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float speedArray[] = { 42.0, 30.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#endif
#define NSPEEDS (sizeof(speedArray)/sizeof(float))
@@ -1109,11 +1132,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// TODO: benchmark balanced tree vs. split tree
//if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
#else
// SPLIT_TREE works better on older archs.
int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
if (ccMin < 80 && graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
#endif
struct ncclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
// First try crossnic, then decrease speed and finally increase speedIntra.
tmpGraph.pattern = graph->pattern;
int pass = 1;
int speedIndex = 0;
while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
@@ -1128,7 +1160,7 @@ search:
NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
@@ -1138,7 +1170,8 @@ search:
}
#endif
// Optimal solution, stop here
if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
if (time == -1) goto done;
if (graph->nChannels*graph->speedInter >= system->totalWidth) goto done;
if (pass == 1) {
// First pass, we don't have a solution yet ; try other options
@@ -1152,7 +1185,7 @@ search:
if (time != -1) globalTimeout += time;
else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
if (globalTimeout < 0) goto done;
if (globalTimeout < 0 && graph->nChannels) goto done;
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
@@ -1167,10 +1200,6 @@ search:
tmpGraph.typeInter = PATH_PIX;
// Try a simpler tree
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
goto search;
}
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
goto search;
+30 -7
Wyświetl plik
@@ -20,18 +20,17 @@
#endif
#include "xml.h"
#include "cpuset.h"
#include <numa.h>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "XGMI", "PIX", "PXB", "PHB", "SYS", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PHB", "SYS" };
#else
const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
#endif
/******************************************************************/
@@ -226,7 +225,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
}
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
char line[1024];
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
INFO(NCCL_GRAPH, "==========================================");
@@ -515,7 +514,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
}
}
if (remote) {
int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
if (remote->type != GPU) {
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
@@ -600,6 +599,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
@@ -614,6 +614,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(collNetGetProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -631,6 +632,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(ncclNetGetProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -639,6 +641,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
@@ -747,3 +752,21 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
}
return ncclSuccess;
}
// Stores the number of NET nodes recorded in `system` into `*count`.
// `count` must be a valid pointer; it is written unconditionally.
// Always returns ncclSuccess.
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
  const int netCount = system->nodes[NET].count;
  *count = netCount;
  return ncclSuccess;
}
// Computes the lowest and highest cudaCompCap found among the GPU nodes of
// `system`.
//   ccMin : if non-null, receives the lowest value.
//   ccMax : if non-null, receives the highest value.
// Returns ncclInternalError when the system has no GPU node (nothing is
// written in that case), ncclSuccess otherwise.
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
  const int nGpus = system->nodes[GPU].count;
  if (nGpus == 0) return ncclInternalError;

  // Seed both bounds with the first GPU, then scan the remaining ones.
  int lowest = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
  int highest = lowest;
  for (int g = 1; g < nGpus; g++) {
    const int cc = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
    if (cc < lowest) lowest = cc;
    if (highest < cc) highest = cc;
  }

  if (ccMin) *ccMin = lowest;
  if (ccMax) *ccMax = highest;
  return ncclSuccess;
}
+27 -12
Wyświetl plik
@@ -13,8 +13,10 @@
#include <sched.h>
#define LOC_WIDTH 5000.0
#define PASCAL_NVLINK_WIDTH 18.0
#define VOLTA_NVLINK_WIDTH 21.0
#define SM60_NVLINK_WIDTH 18.0
#define SM70_NVLINK_WIDTH 21.0
#define SM80_NVLINK_WIDTH 21.0
#define SM86_NVLINK_WIDTH 12.0
#define PCI_WIDTH 12.0 // PCI Gen3 x16
#define QPI_WIDTH 6.0
#define SKL_QPI_WIDTH 9.0
@@ -40,20 +42,21 @@ extern const char* topoNodeTypeStr[];
// We want link types and path types to match as much as possible
#define LINK_LOC 0
#define LINK_NVL 1
#define LINK_PCI 2
// Skipping 3 for PATH_PXB
// Skipping 4 for PATH_PHB
#define LINK_SYS 5
#define LINK_NET 6
// Skipping 2 for PATH_NVB
#define LINK_PCI 3
// Skipping 4 for PATH_PXB
// Skipping 5 for PATH_PHB
#define LINK_SYS 6
#define LINK_NET 7
extern const char* topoLinkTypeStr[];
#define PATH_LOC 0
#define PATH_NVL 1
#define PATH_PIX 2
#define PATH_PXB 3
#define PATH_PHB 4
#define PATH_SYS 5
#define PATH_NET 6
#define PATH_NVB 2
#define PATH_PIX 3
#define PATH_PXB 4
#define PATH_PHB 5
#define PATH_SYS 6
extern const char* topoPathTypeStr[];
struct ncclTopoNode;
@@ -125,6 +128,7 @@ struct ncclTopoNodeSet {
struct ncclTopoSystem {
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
float maxWidth;
float totalWidth;
int type;
};
@@ -141,6 +145,8 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
*index = -1;
for (int i=0; i<system->nodes[type].count; i++) {
@@ -163,4 +169,13 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
return ncclInternalError;
}
// Returns NVLink speed in GB/s for the given compute capability.
// The exact value 86 is tested first so it is not caught by the ">= 80" rule.
static float ncclTopoNVLinkSpeed(int cudaCompCap) {
  if (cudaCompCap == 86) return SM86_NVLINK_WIDTH;
  if (cudaCompCap >= 80) return SM80_NVLINK_WIDTH;
  if (cudaCompCap >= 70) return SM70_NVLINK_WIDTH;
  if (cudaCompCap >= 60) return SM60_NVLINK_WIDTH;
  // Anything below 60 falls back to the SM80 value.
  return SM80_NVLINK_WIDTH;
}
#endif
+28 -25
Wyświetl plik
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,7 +28,7 @@
* / \ / \ / \ \
* 1 3 5 7 9 11 13
*/
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
int up, down0, down1;
int bit;
for (bit=1; bit<nranks; bit<<=1) {
@@ -37,13 +37,16 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
if (rank == 0) {
*u = -1;
*d0 = nranks > 1 ? bit >> 1 : -1;
*d1 = -1;
*d0 = -1;
// Child rank is > 0 so it has to be our child 1, not 0.
*d1 = nranks > 1 ? bit >> 1 : -1;
return ncclSuccess;
}
up = (rank ^ bit) | (bit << 1);
// if smaller than the parent, we are his first child, otherwise we're his second
if (up >= nranks) up = (rank ^ bit);
*parentChildType = (rank < up) ? 0 : 1;
*u = up;
int lowbit = bit >> 1;
@@ -62,42 +65,42 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
}
/* Build a double binary tree. Take the previous tree for the first tree.
* For the second tree, we use a mirror tree (if nranks is odd)
* For the second tree, we use a mirror tree (if nranks is even)
*
* 8---------0---------5
* ______/ \______ _____/ \______
* 4 12 1 9
* / \ / \ / \
* 2 6 10 3 7 10
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 11 12
* 0---------------8 3----------------11
* ______/ \ / \______
* 4 \ / 7
* / \ \ / / \
* 2 6 10 1 5 9
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 0 2 4 6 8 10
*
* or shift it by one rank (if nranks is even)
* or shift it by one rank (if nranks is odd).
*
* 8---------0--------------9
* ______/ \ ______/ \
* 4 \ 5 \
* / \ \ / \ \
* 2 6 10 3 7 11
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 10 1
* 0---------------8 1---------------9
* ______/ \______ ______/ \______
* 4 12 5 0
* / \ / / \ /
* 2 6 10 3 7 11
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 10 12
*/
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
// First tree ... use a btree
ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
// Second tree ... mirror or shift
if (nranks % 2 == 0) {
if (nranks % 2 == 1) {
// shift
int shiftrank = (rank-1+nranks) % nranks;
int u, d0, d1;
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : (u+1) % nranks;
*d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
*d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
} else {
// mirror
int u, d0, d1;
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : nranks-1-u;
*d1_0 = d0 == -1 ? -1 : nranks-1-d0;
*d1_1 = d1 == -1 ? -1 : nranks-1-d1;
+67 -21
Wyświetl plik
@@ -71,45 +71,66 @@ static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
};
// LL128 max BW (per channel) for the different collectives
// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
// ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.9 };
static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} };
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
#else
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
#endif
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
if (comm->nRanks <= 1) return ncclSuccess;
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess;
int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
int index2 = nNodes <= 2 ? nNodes-1 : 2;
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
int index1 = nNodes == 1 ? compCap80 : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
double llMaxBw = llMaxBws[index1][index2];
double perChMaxTreeBw = perChMaxTreeBws[compCap80][index2];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
comm->nRanks;
int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
comm->nNodes;
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
nRanks;
int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float busBw = graphs[a]->nChannels * speed;
// Various model refinements
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0;
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
double maxTreeBw = comm->nNodes > 2 ?
@@ -118,21 +139,29 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
#else
if (compCap80) busBw = std::min(busBw, 235.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
#endif
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
// Convert bus BW to algorithm BW
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
float interLat = hwLat[NCCL_HW_NET][a][p];
if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (a == NCCL_ALGO_RING) {
float lat = hwLat[hw[a]][a][p];
if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
if (ringGraph->sameChannels) {
comm->latencies[coll][a][p] += lat;
} else {
@@ -144,10 +173,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
} else if (a == NCCL_ALGO_TREE) {
comm->latencies[coll][a][p] +=
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
} else {
comm->latencies[coll][a][p] +=
2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
2 * (nRanks/nNodes-1) * intraLat + interLat;
}
}
}
@@ -168,6 +197,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
}
// Disable CollNet if it is not supported
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET] = 0;
// If user has hard set NCCL_ALGO=COLLNET, ignore it
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
}
}
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
int pEnable = protoEnable[p];
@@ -178,7 +216,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
// Only disable algo for Allreduce since others only have one
if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
if (comm->rank == 0) {
@@ -214,7 +252,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
}
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
// Override defaults with user env
char* str = getenv("NCCL_THREAD_THRESHOLDS");
@@ -263,8 +301,16 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
*time = -1.0; return ncclSuccess;
}
int logSize = log2i(info->nBytes>>6);
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize];
#else
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
#endif
*time = lat + (info->nBytes) / (1000 * bw);
return ncclSuccess;
}
+33 -11
Wyświetl plik
@@ -572,7 +572,6 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
if (index == -1) {
if (nvmlDev == NULL) {
//WARN("No NVML, trying to use CUDA instead");
const char* busId;
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1;
@@ -714,6 +713,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
char* path;
NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
free(path);
}
}
}
@@ -725,10 +725,14 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
NCCLCHECK(wrapNvmlSymbols());
NCCLCHECK(wrapNvmlInit());
nvmlDevice_t nvmlDev;
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
nvmlDevice_t nvmlDev = NULL;
static int nvmlInit = 0;
if (nvmlInit == 0) {
nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
}
if (nvmlInit == 1) {
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
}
NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
return ncclSuccess;
}
@@ -771,12 +775,8 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
strcpy(busId, pciSysPath+offset+1);
NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
if (parent == NULL) {
NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
NCCLCHECK(xmlSetAttr(parent, "busid", busId));
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
}
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
} else {
// Virtual NIC, no PCI device, attach to first CPU
NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
@@ -795,6 +795,28 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
return ncclSuccess;
}
// Recursively trims the XML subtree rooted at `node`.
//  - A node whose "keep" attribute equals "1" is preserved together with its
//    whole subtree; only the "keep" marker itself is removed.
//  - Any other node has its children trimmed first and is then removed if it
//    ends up with no children.
// Errors from the xml helpers are propagated through NCCLCHECK.
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
  const char* keep;
  NCCLCHECK(xmlGetAttr(node, "keep", &keep));

  if (keep && strcmp(keep, "1") == 0) {
    NCCLCHECK(xmlUnsetAttr(node, "keep"));
    return ncclSuccess;
  }

  // Work on a snapshot of the children: removing a child rewrites
  // node->subs and node->nSubs while we are still iterating.
  const int childCount = node->nSubs;
  struct ncclXmlNode* children[MAX_SUBS];
  for (int c = 0; c < childCount; c++) children[c] = node->subs[c];

  for (int c = 0; c < childCount; c++) {
    NCCLCHECK(ncclTopoTrimXmlRec(children[c]));
  }

  // Nothing left underneath (or nothing to begin with): drop this node too.
  if (node->nSubs == 0) {
    NCCLCHECK(xmlRemoveNode(node));
  }
  return ncclSuccess;
}
// Entry point of the trimming pass: runs ncclTopoTrimXmlRec starting at the
// first element of xml->nodes.
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
  struct ncclXmlNode* first = xml->nodes;
  NCCLCHECK(ncclTopoTrimXmlRec(first));
  return ncclSuccess;
}
/**************************************************/
/* Parser rules for the user-defined graph search */
/**************************************************/
+40 -5
Wyświetl plik
@@ -8,7 +8,7 @@
#define XML_H_
// A few constraints to make the implementation easy
#define MAX_STR_LEN 256
#define MAX_STR_LEN 255
#define MAX_ATTR_COUNT 16
#define MAX_SUBS 32
#define MAX_NODES 1024
@@ -19,10 +19,10 @@
#define NODE_TYPE_SINGLE 3
struct ncclXmlNode {
char name[MAX_STR_LEN];
char name[MAX_STR_LEN+1];
struct {
char key[MAX_STR_LEN];
char value[MAX_STR_LEN];
char key[MAX_STR_LEN+1];
char value[MAX_STR_LEN+1];
} attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
int nAttrs;
int type;
@@ -47,6 +47,9 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
/* Remove unneeded parts */
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
/**************/
/* XML Struct */
/* Functions */
@@ -56,7 +59,7 @@ static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrNa
*index = -1;
const int nAttrs = node->nAttrs;
for (int a=0; a<nAttrs; a++) {
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) == 0) {
*index = a;
return ncclSuccess;
}
@@ -127,8 +130,10 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
node->attrs[index].key[MAX_STR_LEN] = '\0';
}
strncpy(node->attrs[index].value, value, MAX_STR_LEN);
node->attrs[index].value[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
@@ -138,8 +143,10 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
node->attrs[index].key[MAX_STR_LEN] = '\0';
}
snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
node->attrs[index].value[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
@@ -149,8 +156,22 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
node->attrs[index].key[MAX_STR_LEN] = '\0';
}
snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
node->attrs[index].value[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
// Removes attribute `attrName` from `node`, if present.
// The attributes stored after it are each moved one slot towards the front so
// the array stays contiguous, and nAttrs is decremented.
// Returns ncclSuccess whether or not the attribute existed.
static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
  int pos;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &pos));
  if (pos == -1) return ncclSuccess;

  // Close the gap left at `pos` by pulling each following entry down by one.
  for (int dst = pos; dst + 1 < node->nAttrs; dst++) {
    strcpy(node->attrs[dst].key, node->attrs[dst+1].key);
    strcpy(node->attrs[dst].value, node->attrs[dst+1].value);
  }
  node->nAttrs--;
  return ncclSuccess;
}
@@ -199,6 +220,20 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
s->parent = parent;
if (parent) parent->subs[parent->nSubs++] = s;
strncpy(s->name, subName, MAX_STR_LEN);
s->name[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
// Detaches `node` from the XML tree.
// The node is marked NODE_TYPE_NONE and, when it has a parent, its entry is
// removed from parent->subs by shifting the following siblings down one slot.
// Always returns ncclSuccess.
static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
  node->type = NODE_TYPE_NONE;
  struct ncclXmlNode* parent = node->parent;
  if (parent == NULL) return ncclSuccess;

  int found = 0;
  for (int s=0; s<parent->nSubs; s++) {
    if (parent->subs[s] == node) found = 1;
    else if (found) parent->subs[s-1] = parent->subs[s];
  }
  // Only shrink the child list if the node really was one of the children;
  // otherwise we would silently drop an unrelated sibling (or push nSubs
  // below zero when the parent has no children).
  if (found) parent->nSubs--;
  return ncclSuccess;
}
+115 -79
Wyświetl plik
@@ -35,7 +35,6 @@ struct ncclInitArgs {
};
struct ncclCollArgs {
ncclComm_t comm;
int connect;
};
enum ncclAsyncFuncType {
@@ -110,6 +109,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
NCCL_API(ncclResult_t, ncclGroupStart);
ncclResult_t ncclGroupStart() {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (ncclGroupMode == 0) {
memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
}
@@ -118,7 +118,7 @@ ncclResult_t ncclGroupStart() {
}
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
1, 1 };
info.delta = delta;
@@ -126,26 +126,32 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
info.sendbytes = sendbytes;
info.recvbytes = recvbytes;
if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
NCCLCHECK(ncclSaveKernel(&info));
NCCLCHECK(ncclSaveP2pKernel(&info));
return ncclSuccess;
}
// Thread entry point (started through pthread_create with a struct ncclAsyncArgs*).
// Selects the communicator's device with hipSetDevice, runs ncclTransportP2pSetup
// and hands the args pointer back as the thread result.
// CUDACHECKTHREAD / NCCLCHECKTHREAD are defined elsewhere; presumably they record
// the failure in args and return early -- TODO confirm.
// NOTE(review): this span calls ncclTransportP2pSetup with two different argument
// lists, declares `comm` both inside the loop and at function scope, and calls
// hipSetDevice twice. It looks like the pre-merge and post-merge bodies shown
// together; confirm which lines are current before relying on it.
void* ncclAsyncThreadPreconnect(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
CUDACHECKTHREAD(hipSetDevice(args->coll.comm->cudaDev));
// Per-channel setup: connect the pending recv/send peers of channel c, then clear the pending counts.
for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
struct ncclComm* comm = args->coll.comm;
struct ncclChannel* channel = comm->channels+c;
struct ncclP2PConnect* connect = &comm->p2plist.connect;
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
connect->nrecv[c] = 0;
connect->nsend[c] = 0;
}
// Whole-communicator setup: a single call taking only the communicator.
struct ncclComm* comm = args->coll.comm;
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL));
return args;
}
// Picks the chunk size used to split a p2p transfer of `totalSize` bytes.
// Starts from totalSize spread over minChannels (but never below minSize) and,
// while the chunk is still larger than maxSize and the channel count is at most
// maxChannels/2, doubles the channel count and recomputes the chunk.
// The result is finally passed through ALIGN_SIZE with minSize (presumably
// rounding it up to a multiple of minSize -- the macro is defined elsewhere).
static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
  size_t chunk = DIVUP(totalSize, minChannels);
  if (chunk < minSize) chunk = minSize;

  int channels = minChannels;
  for (;;) {
    if (chunk <= maxSize) break;            // small enough already
    if (channels > maxChannels/2) break;    // doubling again is not allowed
    channels *= 2;
    chunk = DIVUP(totalSize, channels);
  }

  ALIGN_SIZE(chunk, minSize);
  return chunk;
}
NCCL_API(ncclResult_t, ncclGroupEnd);
ncclResult_t ncclGroupEnd() {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (ncclGroupMode == 0) {
WARN("ncclGroupEnd: not in a group call.");
return ncclInvalidUsage;
@@ -186,29 +192,21 @@ ncclResult_t ncclGroupEnd() {
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
if (p2plist->count != 0) {
struct ncclComm* comm = args->coll.comm;
args->coll.connect = 0;
for (int c=0; c<comm->p2pnChannels; c++)
args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
if (args->coll.connect) {
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
}
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
int err = pthread_join(ncclGroupThreads[i], NULL);
if (err != 0) {
WARN("Error waiting for pthread_join : %s\n", strerror(errno));
return ncclSystemError;
}
NCCLCHECKGOTO(args->ret, ret, end);
args->coll.comm->connect = 0;
}
}
@@ -218,56 +216,98 @@ ncclResult_t ncclGroupEnd() {
struct ncclComm* comm = args->coll.comm;
int rank = comm->rank;
int nRanks = comm->nRanks;
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
if (p2plist->count) {
for (int delta=0; delta<nRanks; delta++) {
struct ncclP2Plist* p2pSends = comm->p2pSends;
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
// Compute how much to split operations
// Natural step size matching buffer steps.
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
// Try to use all channels
int nChannelsMax = comm->p2pnChannelsPerPeer;
int nChannelsMin = nChannelsMax;
// Try to use all channels, but one channel per operation.
while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
// Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
for (int d=0; d<=nRanks/4; d++) {
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
int index = 0;
int delta = deltas[index];
sched_delta:
uint32_t from = (rank+nRanks-delta)%nRanks;
uint32_t to = (rank+delta)%nRanks;
struct ncclP2Pinfo* recv = p2pRecvs[from].head;
struct ncclP2Pinfo* send = p2pSends[to].head;
if (recv != NULL || send != NULL) {
ssize_t totRecvBytes = -1, totSendBytes = -1;
if (recv != NULL) totRecvBytes = recv->nbytes;
if (send != NULL) totSendBytes = send->nbytes;
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
// Compute how much to split operations
// Natural step size matching buffer steps.
ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
// Split each operation on p2pnChannelsPerPeer max.
ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
ssize_t sendOffset = 0;
ssize_t recvOffset = 0;
int remaining = 1;
int chunk = 0;
while (remaining) {
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
remaining = 0;
ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
if (sendbytes >= 0 || recvbytes >= 0) {
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
ssize_t sendOffset = 0;
ssize_t recvOffset = 0;
int sendRemaining = 1, recvRemaining = 1;
int chunk = 0;
do {
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
ssize_t recvbytes = totRecvBytes-recvOffset;
ssize_t sendbytes = totSendBytes-sendOffset;
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
if (sendbytes >= 0 || recvbytes >= 0) {
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
}
recvOffset += recvChunkSize;
sendOffset += sendChunkSize;
chunk++;
} while (sendRemaining || recvRemaining);
if (recv) {
NCCLCHECKGOTO(dequeueP2pInfo(p2pRecvs+from), ret, group_cleanup);
comm->p2pRecvCount--;
}
recvOffset += recvChunkSize;
sendOffset += sendChunkSize;
chunk++;
if (send) {
NCCLCHECKGOTO(dequeueP2pInfo(p2pSends+to), ret, group_cleanup);
comm->p2pSendCount--;
}
}
index++;
if (index == 1 && deltas[1] == deltas[0]) index++;
if (index == 2 && deltas[2] == deltas[0]) index++;
if (index == 3 && deltas[3] == deltas[2]) index++;
if (index == 3 && deltas[3] == deltas[1]) index++;
if (index < 4) {
delta = deltas[index];
goto sched_delta;
}
}
p2plist->count = 0;
}
}
}
/* Collectives are done in three steps :
* 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
* 2. Barrier Wait. No CUDA call is permitted
* 3. Enqueue Events. CUDA event wait/enqueue.
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
* hipFree happens between 1 and 3, it could block that CUDA call and
* cudaFree happens between 1 and 3, it could block that CUDA call and
* prevent some ranks from launching their network threads, which would
* prevent the NCCL call from completing, blocking the hipFree call.
* prevent the NCCL call from completing, blocking the cudaFree call.
*/
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
ncclComm_t comm = args->coll.comm;
NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
@@ -304,32 +344,28 @@ group_cleanup:
*args->init.newcomm = NULL;
} else {
struct ncclComm* comm = args->coll.comm;
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
struct ncclChannel* channel = comm->channels+c;
for (int i=0; i<channel->collCount; i++) {
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
// Reset aggregation counters
comm->asyncOpCount = 0;
comm->asyncTotalSize = 0;
// Dequeue p2p lists
if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
struct ncclP2Plist* p2pSends = comm->p2pSends;
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
for (int peer=0; peer<comm->nRanks; peer++) {
while (p2pSends[peer].head != NULL) dequeueP2pInfo(p2pSends+peer);
while (p2pRecvs[peer].head != NULL) dequeueP2pInfo(p2pRecvs+peer);
}
channel->collFifoTail = channel->collStart;
channel->collCount = 0;
comm->p2pSendCount = comm->p2pRecvCount = 0;
}
/* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
/* Free all proxy ops in state->nextOps */
struct ncclProxyState* state = &comm->proxyState;
struct ncclProxyArgs *op, *start;
pthread_mutex_lock(&state->mutex);
op = start = state->ops;
while (op) {
if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
struct ncclProxyArgs* peerOp = op->nextPeer;
while (peerOp) {
if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
peerOp = peerOp->nextPeer;
}
op = op->next;
if (op == start) break;
pthread_mutex_lock(&state->poolMutex);
for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
op->next = state->pool;
state->pool = op;
}
comm->opCount = comm->lastOpCount;
pthread_cond_signal(&state->cond);
pthread_mutex_unlock(&state->mutex);
pthread_mutex_unlock(&state->poolMutex);
state->nextOps = NULL;
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
comm->userStreamSet = false;
+3 -1
Wyświetl plik
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
#endif
+1 -1
Wyświetl plik
@@ -24,7 +24,7 @@ static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, voi
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+44 -56
Wyświetl plik
@@ -8,63 +8,60 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
#define FUNC_INDEX_P2P (4+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((coll >= NCCL_NUM_FUNCTIONS) \
? (coll-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
: ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
#define FUNC_INDEX_P2P 1800
#define FUNC_INDEX(func, redop, ncclType, al, pr) ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
ncclFunction_##func##_##algo##_##proto##_##redop##_##type
#define NCCL_KERN_NAME(coll, op, dtype) \
coll##Kernel_##op##_##dtype
#define NCCL_KERN_NAME(func, algo, proto, redop, type) \
ncclKernel_##func##_##algo##_##proto##_##redop##_##type
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
/* Declare all collective operations */
#define DECL_COLL5(coll, op, dtype) \
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm); \
#define DECL5(func, algo, proto, redop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first); \
#define DECL_COLL4(coll, op, dtype) \
DECL_COLL5(coll, op, dtype) \
DECL_COLL5(coll##LL, op, dtype) \
DECL_COLL5(coll##LL128, op, dtype)
#define DECL4(func, algo, redop, type) \
DECL5(func, algo, SIMPLE, redop, type) \
DECL5(func, algo, LL, redop, type) \
DECL5(func, algo, LL128, redop, type)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
DECL_COLL4(coll##Tree, op, dtype) \
DECL_COLL4(coll##CollNet, op, dtype)
#define DECL3(func, redop, type) \
DECL4(func, RING, redop, type) \
DECL4(func, TREE, redop, type) \
DECL4(func, COLLNET, redop, type)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
DECL_COLL3(coll, op, u8) \
DECL_COLL3(coll, op, i32) \
DECL_COLL3(coll, op, u32) \
DECL_COLL3(coll, op, i64) \
DECL_COLL3(coll, op, u64) \
DECL_COLL3(coll, op, f16) \
DECL_COLL3(coll, op, f32) \
DECL_COLL3(coll, op, f64) \
DECL_COLL3(coll, op, b16)
#define DECL2(func, redop) \
DECL3(func, redop, int8_t) \
DECL3(func, redop, uint8_t) \
DECL3(func, redop, int32_t) \
DECL3(func, redop, uint32_t) \
DECL3(func, redop, int64_t) \
DECL3(func, redop, uint64_t) \
DECL3(func, redop, half) \
DECL3(func, redop, float) \
DECL3(func, redop, double) \
DECL3(func, redop, rccl_bfloat16)
#define DECL_COLL(coll) \
DECL_COLL2(coll, sum) \
DECL_COLL2(coll, prod) \
DECL_COLL2(coll, min) \
DECL_COLL2(coll, max)
#define DECL(func) \
DECL2(func, Sum) \
DECL2(func, Prod) \
DECL2(func, Min) \
DECL2(func, Max)
#define DECL_ALL_COLLS \
DECL_COLL2(ncclBroadcast, copy) \
DECL_COLL(ncclReduce) \
DECL_COLL2(ncclAllGather, copy) \
DECL_COLL(ncclReduceScatter) \
DECL_COLL(ncclAllReduce) \
DECL_COLL5(ncclGather, copy, i8) \
DECL_COLL5(ncclScatter, copy, i8) \
DECL_COLL5(ncclAllToAll, copy, i8) \
DECL_COLL5(ncclAllToAllv, copy, i8) \
DECL_COLL5(ncclSendRecv, copy, i8) \
#define DECL_ALL \
DECL2(Broadcast, Sum) \
DECL(Reduce) \
DECL2(AllGather, Sum) \
DECL(ReduceScatter) \
DECL(AllReduce) \
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
DECL_ALL_COLLS
DECL_ALL
// CHUNKSIZE must be a multiple of SLICESIZE
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -84,13 +81,4 @@ DECL_ALL_COLLS
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define SENDRECV_SLICEFACTOR 1
#define GATHER_SLICESTEPS 4
#define GATHER_CHUNKSTEPS 4
#define SCATTER_SLICESTEPS 4
#define SCATTER_CHUNKSTEPS 4
#define ALLTOALL_SLICESTEPS 4
#define ALLTOALL_CHUNKSTEPS 4
#define ALLTOALLV_SLICESTEPS 4
#define ALLTOALLV_CHUNKSTEPS 4
#endif
+17 -4
Wyświetl plik
@@ -52,8 +52,8 @@ struct ncclRecvMem {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
void* ptrsFifo[NCCL_STEPS];
};
char pad4[MEM_ALIGN];
};
@@ -67,6 +67,10 @@ struct ncclComm {
struct ncclTopoSystem* topo;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
int connect;
uint32_t* connectSend;
uint32_t* connectRecv;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
@@ -131,8 +135,8 @@ struct ncclComm {
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclDevComm* args;
struct ncclDevComm** argsptr;
struct ncclWorkElem args;
void* argsptr;
// Global proxy thread
pthread_t proxyThread;
@@ -140,8 +144,17 @@ struct ncclComm {
// Whether this communicator uses collNet
int collNetSupport;
// Store info of async operations
struct ncclInfo* asyncOps;
int asyncOpCount;
size_t asyncTotalSize;
//list of async p2p operation queued in a group semantics
struct ncclP2Plist p2plist;
struct ncclP2Plist* p2pSends;
struct ncclP2Plist* p2pRecvs;
int p2pSendCount;
int p2pRecvCount;
// RCCL AllToAll/Scatter/Gather API
bool alltoallDisable;
+1
Wyświetl plik
@@ -57,5 +57,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
#include "alloc.h"
#include "utils.h"
#include "param.h"
#include "nvtx_stub.h"
#endif // end include guard
+2 -2
Wyświetl plik
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32-1;
cpumasks[m] = 0;
@@ -42,7 +42,7 @@ ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
return ncclSuccess;
}
ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
int c = 0;
uint8_t* m8 = (uint8_t*)mask;
for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
+44 -64
Wyświetl plik
@@ -23,8 +23,8 @@
#endif
#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
#define NCCL_ALGO_TREE 0
@@ -59,6 +59,7 @@ union ncclLLFifoLine {
#define WARP_SIZE 64
#define MAXCHANNELS 32
#define NCCL_MAX_NTHREADS 256
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
@@ -72,7 +73,7 @@ union ncclLLFifoLine {
// Make sure the clean mask will last for at least NCCL_NSTEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
#define NCCL_LL128_LINESIZE 64
#define NCCL_LL128_LINESIZE 128
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
@@ -83,15 +84,12 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
// to 3 dests. Use 70% for reduce and 30% for bcast.
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
#define NCCL_DIRECT_GPU 0x01
#define NCCL_DIRECT_NIC 0x10
#define MAXBARRIERS 2
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
@@ -99,9 +97,11 @@ struct ncclConnInfo {
uint64_t *head; // Local for send, remote for recv
int direct; // Direct communication
int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
int *sizesFifo; // Sizes fifo from GPU to proxy
void* *ptrsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
@@ -110,7 +110,6 @@ struct ncclConnInfo {
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
struct ncclConnector {
@@ -151,68 +150,53 @@ struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(4) /* set alignment to 4 bytes boundary */
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclDevComm* comm;
uint64_t opCount;
#define NCCL_MAX_WORK_ELEMENTS 2
#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
/* ncclWork is to be a power of two, currently NCCL_MAX_WORK_ELEMENTS x 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
struct ncclWorkElem {
// Header
struct ncclDevComm* comm;
uint16_t nThreads;
uint16_t funcIndex;
uint16_t index;
uint16_t active;
// local and remote input, output, and buffer
const void * sendbuff;
void * recvbuff;
// Op-specific fields. Make sure the common part stays the
// same on all structs of the union
uint64_t opCount;
// Op-specific fields.
union {
struct {
uint16_t nThreads;
} common;
struct {
uint16_t nThreads;
uint8_t bid;
uint8_t nChannels;
uint32_t root;
size_t count;
size_t lastChunkSize;
} coll;
struct {
uint16_t nThreads;
uint16_t unused;
int32_t delta;
size_t sendCount;
size_t recvCount;
} p2p;
struct {
uint16_t nThreads;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
size_t count;
size_t* extra;
} a2av;
};
};
struct ncclColl {
union {
} coll;
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
size_t sendCount;
size_t recvCount;
int32_t delta;
uint16_t nThreads;
} p2p;
uint64_t align[3];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclWork {
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
};
static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree treeUp;
struct ncclTree treeDn;
struct ncclTree collTreeUp;
struct ncclTree collTreeDn;
struct ncclTree tree;
struct ncclTree collTree;
int id;
@@ -221,16 +205,10 @@ struct ncclChannel {
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
size_t* collectivesExtra;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
struct ncclWork* workFifo;
int workCount;
uint64_t workFifoTail; // Only used by CPU
uint32_t* sync;
uint64_t* barrier;
uint64_t* barrier_next;
#ifdef ENABLE_PROFILING
struct timeval tvs;
uint64_t sizes;
@@ -288,9 +266,11 @@ struct ncclProf {
#ifdef ENABLE_COLLTRACE
typedef enum {
ncclCollTraceNotReady,
ncclCollTraceKernelLaunchType,
ncclCollTraceCollEndType,
ncclCollTraceAbortType
ncclCollTraceAbortType,
ncclCollTraceDataType
} ncclCollTraceDataType_t;
struct ncclCollTrace {
@@ -304,7 +284,7 @@ struct ncclCollTrace {
};
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
#define COLLTRACE_NUM_ITEMS 1024
#define COLLTRACE_NUM_ITEMS 8192
#endif
struct ncclDevComm {
+2
Wyświetl plik
@@ -19,5 +19,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
#endif // End include guard
+9 -9
Wyświetl plik
@@ -29,7 +29,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// Set CPU affinity
@@ -45,15 +45,16 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
#define NCCL_TOPO_CPU_TYPE_ZEN 3
#define NCCL_TOPO_CPU_TYPE_ROME 4
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
#define NCCL_TOPO_MAX_NODES 256
// Init search. Needs to be done before calling ncclTopoCompute
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
struct ncclTopoGraph {
// Input / output
@@ -84,17 +85,16 @@ struct ncclTopoRanks {
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeUpRecv[MAXCHANNELS];
int treeUpSend[MAXCHANNELS];
int treeDnRecv[MAXCHANNELS];
int treeDnSend[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+1 -7
Wyświetl plik
@@ -20,8 +20,7 @@ typedef enum {
ncclPatternTreeDown,
ncclPatternTreeUpDown,
ncclPatternCollTreeUp,
ncclPatternCollTreeDown,
ncclPatternAll
ncclPatternCollTreeDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
@@ -40,11 +39,6 @@ struct ncclInfo {
// Algorithm details
int chunkSteps;
int sliceSteps;
// For alltoallv
const size_t *sendcounts;
const size_t *sdispls;
const size_t *recvcounts;
const size_t *rdispls;
// Computed later
int algorithm;
int protocol;
+15 -12
Wyświetl plik
@@ -15,6 +15,9 @@
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
@@ -29,9 +32,9 @@ typedef struct {
int speed; // Port speed in Mbps.
int port; // Port number.
int maxComms; // Maximum number of comms we can create
}ncclNetProperties_v3_t;
}ncclNetProperties_v4_t;
typedef ncclNetProperties_v3_t ncclNetProperties_t;
typedef ncclNetProperties_v4_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
@@ -41,7 +44,7 @@ typedef struct {
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
@@ -62,7 +65,7 @@ typedef struct {
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
@@ -70,11 +73,11 @@ typedef struct {
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v3_t;
} ncclNet_v4_t;
typedef ncclNet_v3_t ncclNet_t;
typedef ncclNet_v4_t ncclNet_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
typedef struct {
// Name of the collective network (mainly for logs)
@@ -85,7 +88,7 @@ typedef struct {
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
@@ -105,17 +108,17 @@ typedef struct {
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v3_t;
} ncclCollNet_v4_t;
typedef ncclCollNet_v3_t ncclCollNet_t;
typedef ncclCollNet_v4_t ncclCollNet_t;
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
#endif // end include guard
+2 -2
Wyświetl plik
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -25,7 +25,7 @@ static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, voi
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
+1 -15
Wyświetl plik
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -45,14 +45,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
return ncclSuccess;
@@ -66,10 +58,6 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
return ncclSuccess;
@@ -150,12 +138,10 @@ ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
#endif // NVML_DIRECT
+14
Wyświetl plik
@@ -0,0 +1,14 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVTX_H_
#define NCCL_NVTX_H_
#include "nvtx3.hpp"
// Tag type whose only content is the compile-time string constant "NCCL",
// exposed as the static member `name`.
// NOTE(review): presumably meant to be passed as the domain template argument
// of the nvtx3 C++ API included above, which would read `name` to label the
// NVTX domain -- confirm against nvtx3.hpp.
struct nccl_domain{static constexpr char const* name{"NCCL"};};
#endif
Plik diff jest za duży Load Diff
Plik diff jest za duży Load Diff
@@ -0,0 +1,141 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include "cuda.h"
#ifndef NVTOOLSEXT_CUDA_V3
#define NVTOOLSEXT_CUDA_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for CUDA Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_CUDA CUDA Resource Naming
*
* This section covers the API functions that allow annotating CUDA resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_CUDA 4
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for CUDA
*
* One enumerator per kind of CUDA driver-API object that can be named; the
* trailing comment on each enumerator gives the handle type it stands for.
* Every value is produced by NVTX_RESOURCE_MAKE_TYPE with the class token
* CUDA and a per-type index (1 through 4).
* NOTE(review): NVTX_RESOURCE_MAKE_TYPE is defined outside this header
* (presumably in nvToolsExt.h) and presumably combines the index with
* NVTX_RESOURCE_CLASS_CUDA defined above -- confirm before relying on the
* numeric values.
*/
typedef enum nvtxResourceCUDAType_t
{
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
} nvtxResourceCUDAType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA device.
*
* Allows the user to associate a CUDA device with a user-provided name.
*
* \param device - The handle of the CUDA device to name.
* \param name - The name of the CUDA device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA context.
*
* Allows the user to associate a CUDA context with a user-provided name.
*
* \param context - The handle of the CUDA context to name.
* \param name - The name of the CUDA context.
*
* \par Example:
* \code
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
* if ( CUDA_SUCCESS != status )
* goto Error;
* nvtxNameCuContext(cuContext, "CTX_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA stream.
*
* Allows the user to associate a CUDA stream with a user-provided name.
*
* \param stream - The handle of the CUDA stream to name.
* \param name - The name of the CUDA stream.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA event.
*
* Allows the user to associate a CUDA event with a user-provided name.
*
* \param event - The handle of the CUDA event to name.
* \param name - The name of the CUDA event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameCuDevice nvtxNameCuDeviceW
#define nvtxNameCuContext nvtxNameCuContextW
#define nvtxNameCuStream nvtxNameCuStreamW
#define nvtxNameCuEvent nvtxNameCuEventW
#else
#define nvtxNameCuDevice nvtxNameCuDeviceA
#define nvtxNameCuContext nvtxNameCuContextA
#define nvtxNameCuStream nvtxNameCuStreamA
#define nvtxNameCuEvent nvtxNameCuEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplCuda_v3.h"
#undef NVTX_IMPL_GUARD_CUDA
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_CUDA_V3 */
@@ -0,0 +1,117 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include "cuda.h"
#include "driver_types.h"
#ifndef NVTOOLSEXT_CUDART_V3
#define NVTOOLSEXT_CUDART_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for CUDA Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
*
* This section covers the API functions that allow to annotate CUDA resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Resource class identifier for CUDA Runtime resources.
*
* CUDART is the class name passed as the first argument of
* NVTX_RESOURCE_MAKE_TYPE by the nvtxResourceCUDARTType_t enumerators.
* NOTE(review): NVTX_RESOURCE_MAKE_TYPE is defined outside this header;
* presumably it combines this class value with a per-class index so that
* resource types of different classes do not collide -- confirm against its
* definition.
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_CUDART 5
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for CUDART
 *
 * Each value is built with NVTX_RESOURCE_MAKE_TYPE from the CUDART class and a
 * per-class index; the trailing comment on each enumerator names the CUDA
 * Runtime type of the resource it identifies.
 *
 * The last enumerator deliberately has no trailing comma: a trailing comma in
 * an enumerator list is only valid from C99 / C++11 onwards, and the other
 * resource-type enums in these headers omit it as well.
 */
typedef enum nvtxResourceCUDARTType_t
{
    NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
    NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
    NVTX_RESOURCE_TYPE_CUDART_EVENT  = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2)  /* cudaEvent_t */
} nvtxResourceCUDARTType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA device.
*
* Allows the user to associate a CUDA device with a user-provided name.
*
* Two variants are declared: nvtxNameCudaDeviceA takes the name as a \c char
* string and nvtxNameCudaDeviceW takes it as a \c wchar_t string. The
* unsuffixed name nvtxNameCudaDevice is a macro that resolves to the W variant
* when UNICODE is defined and to the A variant otherwise.
*
* \param device - The id of the CUDA device to name.
* \param name - The name of the CUDA device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA stream.
*
* Allows the user to associate a CUDA stream with a user-provided name.
*
* Two variants are declared: nvtxNameCudaStreamA takes the name as a \c char
* string and nvtxNameCudaStreamW takes it as a \c wchar_t string. The
* unsuffixed name nvtxNameCudaStream is a macro that resolves to the W variant
* when UNICODE is defined and to the A variant otherwise.
*
* \param stream - The handle of the CUDA stream to name.
* \param name - The name of the CUDA stream.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA event.
*
* Allows the user to associate a CUDA event with a user-provided name.
*
* Two variants are declared: nvtxNameCudaEventA takes the name as a \c char
* string and nvtxNameCudaEventW takes it as a \c wchar_t string. The
* unsuffixed name nvtxNameCudaEvent is a macro that resolves to the W variant
* when UNICODE is defined and to the A variant otherwise.
*
* \param event - The handle of the CUDA event to name.
* \param name - The name of the CUDA event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
/* Generic (unsuffixed) naming entry points: each one maps to the wide-character
 * (W) function when UNICODE is defined and to the char (A) function otherwise. */
#if defined(UNICODE)
#  define nvtxNameCudaDevice  nvtxNameCudaDeviceW
#  define nvtxNameCudaStream  nvtxNameCudaStreamW
#  define nvtxNameCudaEvent   nvtxNameCudaEventW
#else /* !defined(UNICODE) */
#  define nvtxNameCudaDevice  nvtxNameCudaDeviceA
#  define nvtxNameCudaStream  nvtxNameCudaStreamA
#  define nvtxNameCudaEvent   nvtxNameCudaEventA
#endif /* defined(UNICODE) */
#ifdef __cplusplus
}
#endif /* __cplusplus */
/* Include the CUDA Runtime implementation header unless NVTX_NO_IMPL is
* defined. NVTX_IMPL_GUARD_CUDART is defined only for the duration of the
* #include. */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplCudaRt_v3.h"
#undef NVTX_IMPL_GUARD_CUDART
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_CUDART_V3 */
@@ -0,0 +1,191 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include <CL/cl.h>
#ifndef NVTOOLSEXT_OPENCL_V3
#define NVTOOLSEXT_OPENCL_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for OpenCL Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
*
* This section covers the API functions that allow to annotate OpenCL resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Resource class identifier for OpenCL resources.
*
* OPENCL is the class name passed as the first argument of
* NVTX_RESOURCE_MAKE_TYPE by the nvtxResourceOpenCLType_t enumerators.
* NOTE(review): NVTX_RESOURCE_MAKE_TYPE is defined outside this header;
* presumably it combines this class value with a per-class index so that
* resource types of different classes do not collide -- confirm against its
* definition.
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_OPENCL 6
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for OpenCL
 *
 * Each value is built with NVTX_RESOURCE_MAKE_TYPE from the OPENCL class and a
 * per-class index (1 through 7), one per kind of OpenCL object that can be
 * named through this header.
 *
 * The last enumerator deliberately has no trailing comma: a trailing comma in
 * an enumerator list is only valid from C99 / C++11 onwards, and the other
 * resource-type enums in these headers omit it as well.
 */
typedef enum nvtxResourceOpenCLType_t
{
    NVTX_RESOURCE_TYPE_OPENCL_DEVICE       = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
    NVTX_RESOURCE_TYPE_OPENCL_CONTEXT      = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
    NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
    NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT    = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
    NVTX_RESOURCE_TYPE_OPENCL_SAMPLER      = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
    NVTX_RESOURCE_TYPE_OPENCL_PROGRAM      = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
    NVTX_RESOURCE_TYPE_OPENCL_EVENT        = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7)
} nvtxResourceOpenCLType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL device.
*
* Allows the user to associate an OpenCL device with a user-provided name.
*
* Two variants are declared: nvtxNameClDeviceA takes the name as a \c char
* string and nvtxNameClDeviceW takes it as a \c wchar_t string. The unsuffixed
* name nvtxNameClDevice is a macro that resolves to the W variant when UNICODE
* is defined and to the A variant otherwise.
*
* \param device - The handle of the OpenCL device to name.
* \param name - The name of the OpenCL device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL context.
*
* Allows the user to associate an OpenCL context with a user-provided name.
*
* Two variants are declared: nvtxNameClContextA takes the name as a \c char
* string and nvtxNameClContextW takes it as a \c wchar_t string. The
* unsuffixed name nvtxNameClContext is a macro that resolves to the W variant
* when UNICODE is defined and to the A variant otherwise.
*
* \param context - The handle of the OpenCL context to name.
* \param name - The name of the OpenCL context.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL command queue.
*
* Allows the user to associate an OpenCL command queue with a user-provided
* name.
*
* Two variants are declared: nvtxNameClCommandQueueA takes the name as a
* \c char string and nvtxNameClCommandQueueW takes it as a \c wchar_t string.
* The unsuffixed name nvtxNameClCommandQueue is a macro that resolves to the W
* variant when UNICODE is defined and to the A variant otherwise.
*
* \param command_queue - The handle of the OpenCL command queue to name.
* \param name - The name of the OpenCL command queue.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL memory object.
*
* Allows the user to associate an OpenCL memory object with a user-provided
* name.
*
* Two variants are declared: nvtxNameClMemObjectA takes the name as a \c char
* string and nvtxNameClMemObjectW takes it as a \c wchar_t string. The
* unsuffixed name nvtxNameClMemObject is a macro that resolves to the W variant
* when UNICODE is defined and to the A variant otherwise.
*
* \param memobj - The handle of the OpenCL memory object to name.
* \param name - The name of the OpenCL memory object.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL sampler.
*
* Allows the user to associate an OpenCL sampler with a user-provided name.
*
* Two variants are declared: nvtxNameClSamplerA takes the name as a \c char
* string and nvtxNameClSamplerW takes it as a \c wchar_t string. The
* unsuffixed name nvtxNameClSampler is a macro that resolves to the W variant
* when UNICODE is defined and to the A variant otherwise.
*
* \param sampler - The handle of the OpenCL sampler to name.
* \param name - The name of the OpenCL sampler.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL program.
*
* Allows the user to associate an OpenCL program with a user-provided name.
*
* Two variants are declared: nvtxNameClProgramA takes the name as a \c char
* string and nvtxNameClProgramW takes it as a \c wchar_t string. The
* unsuffixed name nvtxNameClProgram is a macro that resolves to the W variant
* when UNICODE is defined and to the A variant otherwise.
*
* \param program - The handle of the OpenCL program to name.
* \param name - The name of the OpenCL program.
*
* The example calls the W variant explicitly because it passes a wide string
* literal; the unsuffixed macro only accepts such a literal when UNICODE is
* defined.
*
* \code
* cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
* (const char **) &cSourceCL, &program_length, &ciErrNum);
* shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
* nvtxNameClProgramW(cpProgram, L"PROGRAM_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL event.
*
* Allows the user to associate an OpenCL event with a user-provided name.
*
* Two variants are declared: nvtxNameClEventA takes the name as a \c char
* string and nvtxNameClEventW takes it as a \c wchar_t string. The unsuffixed
* name nvtxNameClEvent is a macro that resolves to the W variant when UNICODE
* is defined and to the A variant otherwise.
*
* \param evnt - The handle of the OpenCL event to name.
* \param name - The name of the OpenCL event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
/* Generic (unsuffixed) naming entry points: each one maps to the wide-character
 * (W) function when UNICODE is defined and to the char (A) function otherwise. */
#if defined(UNICODE)
#  define nvtxNameClDevice        nvtxNameClDeviceW
#  define nvtxNameClContext       nvtxNameClContextW
#  define nvtxNameClCommandQueue  nvtxNameClCommandQueueW
#  define nvtxNameClMemObject     nvtxNameClMemObjectW
#  define nvtxNameClSampler       nvtxNameClSamplerW
#  define nvtxNameClProgram       nvtxNameClProgramW
#  define nvtxNameClEvent         nvtxNameClEventW
#else /* !defined(UNICODE) */
#  define nvtxNameClDevice        nvtxNameClDeviceA
#  define nvtxNameClContext       nvtxNameClContextA
#  define nvtxNameClCommandQueue  nvtxNameClCommandQueueA
#  define nvtxNameClMemObject     nvtxNameClMemObjectA
#  define nvtxNameClSampler       nvtxNameClSamplerA
#  define nvtxNameClProgram       nvtxNameClProgramA
#  define nvtxNameClEvent         nvtxNameClEventA
#endif /* defined(UNICODE) */
#ifdef __cplusplus
}
#endif /* __cplusplus */
/* Include the OpenCL implementation header unless NVTX_NO_IMPL is defined.
* NVTX_IMPL_GUARD_OPENCL is defined only for the duration of the #include. */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
#undef NVTX_IMPL_GUARD_OPENCL
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_OPENCL_V3 */
@@ -0,0 +1,382 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#ifndef NVTOOLSEXT_SYNC_V3
#define NVTOOLSEXT_SYNC_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* \cond SHOW_HIDDEN
* \brief Size in bytes of nvtxSyncUserAttributes_v0, cast to uint16_t.
*
* The cast matches the type of the structure's \c size field, which the usage
* examples in this header set to this macro.
* \version \NVTX_VERSION_2
*/
#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
/** \endcond */
/**
* \page PAGE_SYNCHRONIZATION Synchronization
*
* This section covers a subset of the API that allows users to track additional
* synchronization details of their application. Naming OS synchronization primitives
* may allow users to better understand the data collected by traced synchronization
* APIs. Additionally, a user defined synchronization object allows users to tell
* the tools when they are building their own synchronization system that does
* not rely on the OS to provide behaviors and instead uses techniques like
* atomic operations and spinlocks.
*
* See module \ref SYNCHRONIZATION for details.
*
* \par Example:
* \code
* class MyMutex
* {
* volatile long bLocked;
* nvtxSyncUser_t hSync;
* public:
* MyMutex(const char* name, nvtxDomainHandle_t d){
* bLocked = 0;
*
* nvtxSyncUserAttributes_t attribs = { 0 };
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
* attribs.message.ascii = name;
* hSync = nvtxDomainSyncUserCreate(d, &attribs);
* }
*
* ~MyMutex() {
* nvtxDomainSyncUserDestroy(hSync);
* }
*
* bool Lock() {
* nvtxDomainSyncUserAcquireStart(hSync);
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
* if (acquired) {
* nvtxDomainSyncUserAcquireSuccess(hSync);
* }
* else {
* nvtxDomainSyncUserAcquireFailed(hSync);
* }
* return acquired;
* }
* void Unlock() {
* nvtxDomainSyncUserReleasing(hSync);
* bLocked = false;
* }
* };
* \endcode
*
* \version \NVTX_VERSION_2
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Resource class identifiers for synchronization objects.
*
* SYNC_OS and SYNC_PTHREAD are the class names passed as the first argument of
* NVTX_RESOURCE_MAKE_TYPE by the resource-type enums in this header.
* NOTE(review): NVTX_RESOURCE_MAKE_TYPE is defined outside this header;
* presumably it combines the class value with a per-class index so that
* resource types of different classes do not collide -- confirm against its
* definition.
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
#define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \defgroup SYNCHRONIZATION Synchronization
* See page \ref PAGE_SYNCHRONIZATION.
* @{
*/
/** \brief Resource type values for OSs with POSIX Thread API support
*
* Every value is built with NVTX_RESOURCE_MAKE_TYPE from the SYNC_PTHREAD
* class and an index from 1 to 6. The trailing comment on each enumerator
* names the pthread type it identifies.
*/
typedef enum nvtxResourceSyncPosixThreadType_t
{
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
} nvtxResourceSyncPosixThreadType_t;
/** \brief Resource type values for Windows OSs
*
* Every value is built with NVTX_RESOURCE_MAKE_TYPE from the SYNC_OS class and
* an index from 1 to 5. ::nvtxResourceSyncLinuxType_t uses the same class and
* overlapping indices, so enumerators of the two enums share numeric values.
* NOTE(review): presumably only the enum matching the target OS is meant to be
* used in a given build -- confirm.
*/
typedef enum nvtxResourceSyncWindowsType_t
{
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
} nvtxResourceSyncWindowsType_t;
/** \brief Resource type values for Linux and Linux derived OSs such as Android
*
* Every value is built with NVTX_RESOURCE_MAKE_TYPE from the SYNC_OS class and
* an index from 1 to 7. ::nvtxResourceSyncWindowsType_t uses the same class
* and overlapping indices, so enumerators of the two enums share numeric
* values.
* NOTE(review): presumably only the enum matching the target OS is meant to be
* used in a given build -- confirm.
* \sa
* ::nvtxResourceSyncPosixThreadType_t
*/
typedef enum nvtxResourceSyncLinuxType_t
{
NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
} nvtxResourceSyncLinuxType_t;
/** \brief Resource type values for Android come from Linux.
*
* This is only a type alias for the Linux enum; no Android-specific
* enumerators are declared.
* \sa
* ::nvtxResourceSyncLinuxType_t
* ::nvtxResourceSyncPosixThreadType_t
*/
typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
/** \brief User Defined Synchronization Object Handle.
* \anchor SYNCUSER_HANDLE_STRUCTURE
*
* This structure is opaque to the user and is used as a handle to reference
* a user defined synchronization object. The tools will return a pointer through the API for the application
* to hold on its behalf to reference the synchronization object in the future.
*
*/
typedef struct nvtxSyncUser* nvtxSyncUser_t;
/** \brief User Defined Synchronization Object Attributes Structure.
* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
*
* This structure is used to describe the attributes of a user defined synchronization
* object. The layout of the structure is defined by a specific version of the tools
* extension library and can change between different versions of the Tools Extension
* library.
*
* \par Initializing the Attributes
*
* The caller should always perform the following three tasks when using
* attributes:
* <ul>
* <li>Zero the structure
* <li>Set the version field
* <li>Set the size field
* </ul>
*
* Zeroing the structure sets all the event attributes types and values
* to the default value.
*
* The version and size field are used by the Tools Extension
* implementation to handle multiple versions of the attributes structure.
*
* It is recommended that the caller use one of the following two methods
* to initialize the attributes structure:
*
* \par Method 1: Initializing nvtxSyncUserAttributes_t for future compatibility
* \code
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
* \endcode
*
* \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
* \code
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = 1;
* attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
* \endcode
*
* If the caller uses Method 1 it is critical that the entire binary
* layout of the structure be configured to 0 so that all fields
* are initialized to the default value.
*
* The caller should either use both NVTX_VERSION and
* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
* and a versioned type (Method 2). Using a mix of the two methods
* will likely cause either source level incompatibility or binary
* incompatibility in the future.
*
* \par Settings Attribute Types and Values
*
*
* \par Example:
* \code
* // Initialize
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
*
* // Configure the Attributes
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
* attribs.message.ascii = "Example";
* \endcode
*
* \sa
* ::nvtxDomainSyncUserCreate
*/
typedef struct nvtxSyncUserAttributes_v0
{
/**
* \brief Version flag of the structure.
*
* Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
* supported in this header file. This can optionally be overridden to
* another version of the tools extension library.
*/
uint16_t version;
/**
* \brief Size of the structure.
*
* Needs to be set to the size in bytes of this attribute structure;
* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE provides that value for
* nvtxSyncUserAttributes_v0.
*/
uint16_t size;
/** \brief Message type specified in this attribute structure.
*
* Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
* "message" field.
*
* Stored as int32_t; the values are those of nvtxMessageType_t.
*
* Default Value is NVTX_MESSAGE_UNKNOWN
*/
int32_t messageType; /* nvtxMessageType_t */
/** \brief Message assigned to this attribute structure.
*
* The text message associated with the user defined synchronization
* object that this structure describes.
*/
nvtxMessageValue_t message;
} nvtxSyncUserAttributes_v0;
/* Unversioned alias of nvtxSyncUserAttributes_v0. */
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
/* ------------------------------------------------------------------------- */
/** \brief Create a user defined synchronization object
* This is used to track non-OS synchronization working with spinlocks and atomics
*
* \param domain - Domain to own the resource
* \param attribs - Pointer to an ::nvtxSyncUserAttributes_t that assigns
* attributes to the object. See that structure for how its version and size
* fields are expected to be initialized.
*
* \return A handle that represents the newly created user defined synchronization object.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
/* ------------------------------------------------------------------------- */
/** \brief Destroy a user defined synchronization object
* This is used to track non-OS synchronization working with spinlocks and atomics
*
* \param handle - A handle to the object to operate on, as returned by
* ::nvtxDomainSyncUserCreate.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools that an attempt to acquire a user defined synchronization object is starting
* This should be followed by either \ref nvtxDomainSyncUserAcquireSuccess or
* \ref nvtxDomainSyncUserAcquireFailed, as in the MyMutex example on page
* \ref PAGE_SYNCHRONIZATION.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of failure in acquiring a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireStart.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of success in acquiring a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireStart.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of releasing a reservation on a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
/** @} */ /*END defgroup*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
/* Include the synchronization implementation header unless NVTX_NO_IMPL is
* defined. NVTX_IMPL_GUARD_SYNC is defined only for the duration of the
* #include. */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplSync_v3.h"
#undef NVTX_IMPL_GUARD_SYNC
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_SYNC_V3 */
@@ -0,0 +1,438 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* Stop compilation with an explanatory error unless NVTX_IMPL_GUARD is already
* defined when this file is processed. */
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* ---- Include required platform headers ---- */
#if defined(_WIN32)
#include <Windows.h>
#else
#include <unistd.h>
#if defined(__ANDROID__)
#include <android/api-level.h>
#endif
#if defined(__linux__) || defined(__CYGWIN__)
#include <sched.h>
#endif
#include <limits.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <pthread.h>
#include <stdlib.h>
#include <wchar.h>
#endif
/* ---- Define macros used in this file ---- */
/* Initialization state values.
* NOTE(review): presumably these are the values held by
* nvtxGlobals_t::initState, progressing FRESH -> STARTED -> COMPLETE during
* one-time initialization -- confirm against the init code, which is not
* visible here. */
#define NVTX_INIT_STATE_FRESH 0
#define NVTX_INIT_STATE_STARTED 1
#define NVTX_INIT_STATE_COMPLETE 2
/* Debug logging helpers.
 *
 * When NVTX_DEBUG_PRINT is defined, NVTX_ERR and NVTX_INFO forward their
 * printf-style arguments to the Android log (tag "NVTOOLSEXT") on Android, or
 * to stderr with an "NVTX_ERROR: " / "NVTX_INFO: " prefix elsewhere. The
 * stderr variants rely on string-literal concatenation, so the first argument
 * must be a string literal. When NVTX_DEBUG_PRINT is not defined, both macros
 * expand to nothing.
 *
 * None of the expansions ends in a semicolon; the caller supplies it. The
 * Android variants used to carry their own trailing ';', which made
 * "if (c) NVTX_ERR(...); else ..." fail to compile on Android only. */
#ifdef NVTX_DEBUG_PRINT
#ifdef __ANDROID__
#include <android/log.h>
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__)
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__)
#else
#include <stdio.h>
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
#endif
#else /* !defined(NVTX_DEBUG_PRINT) */
#define NVTX_ERR(...)
#define NVTX_INFO(...)
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#ifdef __GNUC__
#pragma GCC visibility push(hidden)
#endif
/* ---- Forward declare all functions referenced in globals ---- */
/* Every declaration below is prefixed with NVTX_LINKONCE_FWDDECL_FUNCTION and
* has its name wrapped in NVTX_VERSIONED_IDENTIFIER; both macros are defined
* outside this file.
* NOTE(review): presumably they give the functions link-once linkage and a
* version-specific symbol name -- confirm against their definitions. */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
NvtxCallbackModule module,
NvtxFunctionTable* out_table,
unsigned int* out_size);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
uint32_t version);
NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
uint32_t exportTableId);
#include "nvtxInitDecls.h"
/* ---- Define all globals ---- */
/* Aggregate of all state used by this NVTX implementation: the init status,
 * the export tables, one implementation function pointer per API call, and
 * per-module tables holding the addresses of those function pointers.
 * The nvtxGlobals instance is initialized positionally, so the field order
 * here must stay in sync with that initializer. */
typedef struct nvtxGlobals_t
{
/* One of the NVTX_INIT_STATE_* values; the global instance starts as NVTX_INIT_STATE_FRESH. */
volatile unsigned int initState;
/* Returned by nvtxGetExportTable for NVTX_ETID_CALLBACKS. */
NvtxExportTableCallbacks etblCallbacks;
/* Returned by nvtxGetExportTable for NVTX_ETID_VERSIONINFO. */
NvtxExportTableVersionInfo etblVersionInfo;
/* Implementation function pointers */
/* Each public API function reads its pointer from here and calls through it
 * when it is non-null. The global instance sets every pointer to the
 * matching *_impl_init function.
 * The CUDA, OpenCL and most CUDA-runtime entries are declared with
 * *_fakeimpl_fntype types; the CUDA/CUDART wrappers cast them to the real
 * *_impl_fntype before calling.
 * NOTE(review): presumably the fake types exist so this header does not need
 * the CUDA/OpenCL headers -- confirm against the fakeimpl typedefs. */
nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
/* Tables of function pointers -- Extra null added to the end to ensure
* a crash instead of silent corruption if a tool reads off the end. */
/* One table per callback module. Each entry is the address of one of the
 * *_impl_fnptr fields above, so writing through an entry replaces the
 * implementation used by the corresponding API function. */
NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
} nvtxGlobals_t;
/* The single nvtxGlobals instance. The initializer is positional: entries
 * must appear in exactly the order the fields are declared in nvtxGlobals_t. */
NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
{
/* initState */
NVTX_INIT_STATE_FRESH,
/* etblCallbacks */
{
sizeof(NvtxExportTableCallbacks),
NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
},
/* etblVersionInfo */
{
sizeof(NvtxExportTableVersionInfo),
NVTX_VERSION,
0,
NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
},
/* Implementation function pointers */
/* Every API call initially points at its *_impl_init function.
 * NOTE(review): the *_impl_init functions presumably run initialization and
 * then forward the call -- see nvtxInitDefs.h to confirm. */
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
/* Tables of function pointers */
/* Each table starts and ends with a null entry; the entries in between are
 * the addresses of the *_impl_fnptr fields of this same object.
 * NOTE(review): the leading null presumably lines up with an "invalid"
 * callback ID of 0 -- confirm against the NVTX_CBID_* enums. */
/* functionTable_CORE */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
0
},
/* functionTable_CUDA */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
0
},
/* functionTable_OPENCL */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
0
},
/* functionTable_CUDART */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
0
},
/* functionTable_CORE2 */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
0
},
/* functionTable_SYNC */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
0
}
};
/* ---- Define static inline implementations of core API functions ---- */
#include "nvtxImplCore.h"
/* ---- Define implementations of export table functions ---- */
/* Export-table callback: looks up the function table that belongs to a
 * callback module.
 *
 *   module    - which module's table is requested.
 *   out_table - if non-null, receives the address of the table.
 *   out_size  - if non-null, receives the number of table slots, not counting
 *               the extra null slot at the end of the table.
 *
 * Returns 1 on success. Returns 0 for an unknown module, in which case
 * neither output is written. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
    NvtxCallbackModule module,
    NvtxFunctionTable* out_table,
    unsigned int* out_size)
{
    NvtxFunctionTable selected = (NvtxFunctionTable)0;
    unsigned int tableBytes = 0;

    switch (module)
    {
        case NVTX_CB_MODULE_CORE:
        {
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
            break;
        }
        case NVTX_CB_MODULE_CUDA:
        {
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
            break;
        }
        case NVTX_CB_MODULE_OPENCL:
        {
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
            break;
        }
        case NVTX_CB_MODULE_CUDART:
        {
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
            break;
        }
        case NVTX_CB_MODULE_CORE2:
        {
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
            break;
        }
        case NVTX_CB_MODULE_SYNC:
        {
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
            break;
        }
        default:
        {
            /* Unknown module: report failure without touching the outputs. */
            return 0;
        }
    }

    if (out_size != 0)
    {
        /* Slot count minus one, so the trailing null slot is not reported. */
        unsigned int slotCount = tableBytes / (unsigned int)sizeof(NvtxFunctionPointer*);
        *out_size = slotCount - 1;
    }

    if (out_table != 0)
    {
        *out_table = selected;
    }

    return 1;
}
/* Returns the address of the export table identified by exportTableId:
 * the callbacks table for NVTX_ETID_CALLBACKS, the version-info table for
 * NVTX_ETID_VERSIONINFO, and a null pointer for any other ID. */
NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
{
    const void* exported = 0;

    switch (exportTableId)
    {
        case NVTX_ETID_CALLBACKS:
            exported = &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
            break;
        case NVTX_ETID_VERSIONINFO:
            exported = &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
            break;
        default:
            break;
    }

    return exported;
}
/* Export-table hook stored in nvtxGlobals.etblVersionInfo. This
 * implementation deliberately ignores the version it is given. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
    uint32_t version)
{
    /* Reserved for custom implementations to resolve problems with tools */
    (void)version;
}
/* ---- Define implementations of init versions of all API functions ---- */
#include "nvtxInitDefs.h"
/* ---- Define implementations of initialization functions ---- */
#include "nvtxInit.h"
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,307 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* Forwards eventAttrib to the implementation stored in
 * nvtxGlobals.nvtxMarkEx_impl_fnptr. Does nothing when that pointer is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxMarkEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(eventAttrib);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)eventAttrib;
#endif /* NVTX_DISABLE */
}
/* Forwards message to the implementation stored in
 * nvtxGlobals.nvtxMarkA_impl_fnptr. Does nothing when that pointer is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxMarkA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards message to the implementation stored in
 * nvtxGlobals.nvtxMarkW_impl_fnptr. Does nothing when that pointer is null
 * or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxMarkW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
}
/* Forwards eventAttrib to the implementation stored in
 * nvtxGlobals.nvtxRangeStartEx_impl_fnptr and returns its result.
 * Returns (nvtxRangeId_t)0 when that pointer is null or when NVTX_DISABLE
 * is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxRangeStartEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(eventAttrib);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)eventAttrib;
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
/* Forwards message to the implementation stored in
 * nvtxGlobals.nvtxRangeStartA_impl_fnptr and returns its result.
 * Returns (nvtxRangeId_t)0 when that pointer is null or when NVTX_DISABLE
 * is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxRangeStartA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
/* Wide-character variant: forwards message to the implementation stored in
 * nvtxGlobals.nvtxRangeStartW_impl_fnptr and returns its result.
 * Returns (nvtxRangeId_t)0 when that pointer is null or when NVTX_DISABLE
 * is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxRangeStartW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
/* Forwards id to the implementation stored in
 * nvtxGlobals.nvtxRangeEnd_impl_fnptr. Does nothing when that pointer is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
    nvtxRangeEnd_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(id);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)id;
#endif /* NVTX_DISABLE */
}
/* Forwards eventAttrib to the implementation stored in
 * nvtxGlobals.nvtxRangePushEx_impl_fnptr and returns its result.
 * Returns NVTX_NO_PUSH_POP_TRACKING when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxRangePushEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(eventAttrib);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)eventAttrib;
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
/* Forwards message to the implementation stored in
 * nvtxGlobals.nvtxRangePushA_impl_fnptr and returns its result.
 * Returns NVTX_NO_PUSH_POP_TRACKING when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxRangePushA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
/* Wide-character variant: forwards message to the implementation stored in
 * nvtxGlobals.nvtxRangePushW_impl_fnptr and returns its result.
 * Returns NVTX_NO_PUSH_POP_TRACKING when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxRangePushW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
/* Calls the implementation stored in nvtxGlobals.nvtxRangePop_impl_fnptr and
 * returns its result. Returns NVTX_NO_PUSH_POP_TRACKING when that pointer is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePop(void)
{
#ifndef NVTX_DISABLE
    nvtxRangePop_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)();
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
/* Forwards (category, name) to the implementation stored in
 * nvtxGlobals.nvtxNameCategoryA_impl_fnptr. Does nothing when that pointer
 * is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCategoryA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(category, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)category;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards (category, name) to the implementation
 * stored in nvtxGlobals.nvtxNameCategoryW_impl_fnptr. Does nothing when that
 * pointer is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCategoryW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(category, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)category;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Forwards (threadId, name) to the implementation stored in
 * nvtxGlobals.nvtxNameOsThreadA_impl_fnptr. Does nothing when that pointer
 * is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameOsThreadA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(threadId, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)threadId;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards (threadId, name) to the implementation
 * stored in nvtxGlobals.nvtxNameOsThreadW_impl_fnptr. Does nothing when that
 * pointer is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameOsThreadW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(threadId, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)threadId;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Forwards (domain, eventAttrib) to the implementation stored in
 * nvtxGlobals.nvtxDomainMarkEx_impl_fnptr. Does nothing when that pointer is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxDomainMarkEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(domain, eventAttrib);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)eventAttrib;
#endif /* NVTX_DISABLE */
}
/* Forwards (domain, eventAttrib) to the implementation stored in
 * nvtxGlobals.nvtxDomainRangeStartEx_impl_fnptr and returns its result.
 * Returns (nvtxRangeId_t)0 when that pointer is null or when NVTX_DISABLE
 * is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangeStartEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(domain, eventAttrib);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)eventAttrib;
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
/* Forwards (domain, id) to the implementation stored in
 * nvtxGlobals.nvtxDomainRangeEnd_impl_fnptr. Does nothing when that pointer
 * is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangeEnd_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(domain, id);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)id;
#endif /* NVTX_DISABLE */
}
/* Forwards (domain, eventAttrib) to the implementation stored in
 * nvtxGlobals.nvtxDomainRangePushEx_impl_fnptr and returns its result.
 * Returns NVTX_NO_PUSH_POP_TRACKING when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangePushEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(domain, eventAttrib);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)eventAttrib;
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
/* Forwards domain to the implementation stored in
 * nvtxGlobals.nvtxDomainRangePop_impl_fnptr and returns its result.
 * Returns NVTX_NO_PUSH_POP_TRACKING when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
    nvtxDomainRangePop_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(domain);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
/* Forwards (domain, attribs) to the implementation stored in
 * nvtxGlobals.nvtxDomainResourceCreate_impl_fnptr and returns its result.
 * Returns a null nvtxResourceHandle_t when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    nvtxDomainResourceCreate_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(domain, attribs);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)attribs;
#endif /* NVTX_DISABLE */
    return (nvtxResourceHandle_t)0;
}
/* Forwards resource to the implementation stored in
 * nvtxGlobals.nvtxDomainResourceDestroy_impl_fnptr. Does nothing when that
 * pointer is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource)
{
#ifndef NVTX_DISABLE
    nvtxDomainResourceDestroy_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(resource);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)resource;
#endif /* NVTX_DISABLE */
}
/* Forwards (domain, category, name) to the implementation stored in
 * nvtxGlobals.nvtxDomainNameCategoryA_impl_fnptr. Does nothing when that
 * pointer is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxDomainNameCategoryA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(domain, category, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)category;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards (domain, category, name) to the
 * implementation stored in nvtxGlobals.nvtxDomainNameCategoryW_impl_fnptr.
 * Does nothing when that pointer is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxDomainNameCategoryW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(domain, category, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)category;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Forwards (domain, string) to the implementation stored in
 * nvtxGlobals.nvtxDomainRegisterStringA_impl_fnptr and returns its result.
 * Returns a null nvtxStringHandle_t when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
{
#ifndef NVTX_DISABLE
    nvtxDomainRegisterStringA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(domain, string);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)string;
#endif /* NVTX_DISABLE */
    return (nvtxStringHandle_t)0;
}
/* Wide-character variant: forwards (domain, string) to the implementation
 * stored in nvtxGlobals.nvtxDomainRegisterStringW_impl_fnptr and returns its
 * result. Returns a null nvtxStringHandle_t when that pointer is null or
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string)
{
#ifndef NVTX_DISABLE
    nvtxDomainRegisterStringW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(domain, string);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
    (void)string;
#endif /* NVTX_DISABLE */
    return (nvtxStringHandle_t)0;
}
/* Forwards message to the implementation stored in
 * nvtxGlobals.nvtxDomainCreateA_impl_fnptr and returns its result.
 * Returns a null nvtxDomainHandle_t when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message)
{
#ifndef NVTX_DISABLE
    nvtxDomainCreateA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
    return (nvtxDomainHandle_t)0;
}
/* Wide-character variant: forwards message to the implementation stored in
 * nvtxGlobals.nvtxDomainCreateW_impl_fnptr and returns its result.
 * Returns a null nvtxDomainHandle_t when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    nvtxDomainCreateW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr;
    if (impl != 0)
    {
        return (*impl)(message);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)message;
#endif /* NVTX_DISABLE */
    return (nvtxDomainHandle_t)0;
}
/* Forwards domain to the implementation stored in
 * nvtxGlobals.nvtxDomainDestroy_impl_fnptr. Does nothing when that pointer
 * is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
    nvtxDomainDestroy_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(domain);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)domain;
#endif /* NVTX_DISABLE */
}
/* Forwards reserved to the implementation stored in
 * nvtxGlobals.nvtxInitialize_impl_fnptr. Does nothing when that pointer is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved)
{
#ifndef NVTX_DISABLE
    nvtxInitialize_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(reserved);
    }
#else
    /* Parameter is unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)reserved;
#endif /* NVTX_DISABLE */
}
@@ -0,0 +1,81 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_CUDART
#error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Real signatures of the CUDA runtime naming implementations. The wrappers
 * below cast the function pointers stored in nvtxGlobals to these types
 * before calling them. "A" variants take char names, "W" variants take
 * wchar_t names. */
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
/* Forwards (device, name) to the implementation stored in
 * nvtxGlobals.nvtxNameCudaDeviceA_impl_fnptr, cast to the real signature.
 * Does nothing when that pointer is null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaDeviceA_impl_fntype impl = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(device, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)device;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards (device, name) to the implementation
 * stored in nvtxGlobals.nvtxNameCudaDeviceW_impl_fnptr, cast to the real
 * signature. Does nothing when that pointer is null or when NVTX_DISABLE is
 * defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaDeviceW_impl_fntype impl = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(device, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)device;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Forwards (stream, name) to the implementation stored in
 * nvtxGlobals.nvtxNameCudaStreamA_impl_fnptr, cast from its placeholder type
 * to the real signature. Does nothing when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaStreamA_impl_fntype impl = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(stream, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)stream;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards (stream, name) to the implementation
 * stored in nvtxGlobals.nvtxNameCudaStreamW_impl_fnptr, cast from its
 * placeholder type to the real signature. Does nothing when that pointer is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaStreamW_impl_fntype impl = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(stream, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)stream;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Forwards (event, name) to the implementation stored in
 * nvtxGlobals.nvtxNameCudaEventA_impl_fnptr, cast from its placeholder type
 * to the real signature. Does nothing when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaEventA_impl_fntype impl = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(event, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)event;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards (event, name) to the implementation
 * stored in nvtxGlobals.nvtxNameCudaEventW_impl_fnptr, cast from its
 * placeholder type to the real signature. Does nothing when that pointer is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCudaEventW_impl_fntype impl = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(event, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)event;
    (void)name;
#endif /* NVTX_DISABLE */
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,102 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_CUDA
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Real signatures of the CUDA driver naming implementations. The wrappers
 * below cast the placeholder-typed function pointers stored in nvtxGlobals
 * to these types before calling them. "A" variants take char names, "W"
 * variants take wchar_t names. */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
/* Forwards (device, name) to the implementation stored in
 * nvtxGlobals.nvtxNameCuDeviceA_impl_fnptr, cast from its placeholder type
 * to the real signature. Does nothing when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuDeviceA_impl_fntype impl = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(device, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)device;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Wide-character variant: forwards (device, name) to the implementation
 * stored in nvtxGlobals.nvtxNameCuDeviceW_impl_fnptr, cast from its
 * placeholder type to the real signature. Does nothing when that pointer is
 * null or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuDeviceW_impl_fntype impl = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(device, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)device;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Forwards (context, name) to the implementation stored in
 * nvtxGlobals.nvtxNameCuContextA_impl_fnptr, cast from its placeholder type
 * to the real signature. Does nothing when that pointer is null or when
 * NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuContextA_impl_fntype impl = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
    if (impl != 0)
    {
        (*impl)(context, name);
    }
#else
    /* Parameters are unused when NVTX is compiled out; avoid unused-parameter warnings. */
    (void)context;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (context, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameCuContextW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
    if (local != 0)
    {
        (*local)(context, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)context;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (stream, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameCuStreamA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
    if (local != 0)
    {
        (*local)(stream, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)stream;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (stream, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameCuStreamW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
    if (local != 0)
    {
        (*local)(stream, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)stream;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (event, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameCuEventA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
    if (local != 0)
    {
        (*local)(event, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)event;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (event, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameCuEventW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
    if (local != 0)
    {
        (*local)(event, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)event;
    (void)name;
#endif /* NVTX_DISABLE */
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,161 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_OPENCL
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Function-pointer types used by the nvtxNameCl* wrappers below. Each wrapper
 * casts its nvtxGlobals slot to the matching type before calling through it.
 * The A variants take a const char* name, the W variants a const wchar_t*. */
typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name);
typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name);
typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name);
typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name);
typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name);
typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name);
typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name);
typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name);
/* Public entry point: forwards (device, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameClDeviceA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClDeviceA_impl_fntype local = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
    if (local != 0)
    {
        (*local)(device, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)device;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (device, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameClDeviceW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClDeviceW_impl_fntype local = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
    if (local != 0)
    {
        (*local)(device, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)device;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (context, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameClContextA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClContextA_impl_fntype local = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
    if (local != 0)
    {
        (*local)(context, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)context;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (context, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameClContextW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClContextW_impl_fntype local = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
    if (local != 0)
    {
        (*local)(context, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)context;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (command_queue, name) to the implementation
 * stored in the nvtxGlobals slot nvtxNameClCommandQueueA_impl_fnptr, if one is
 * installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClCommandQueueA_impl_fntype local = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
    if (local != 0)
    {
        (*local)(command_queue, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)command_queue;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (command_queue, name) to
 * the implementation stored in the nvtxGlobals slot
 * nvtxNameClCommandQueueW_impl_fnptr, if one is installed. A null slot, or
 * NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClCommandQueueW_impl_fntype local = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
    if (local != 0)
    {
        (*local)(command_queue, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)command_queue;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (memobj, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameClMemObjectA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClMemObjectA_impl_fntype local = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
    if (local != 0)
    {
        (*local)(memobj, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)memobj;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (memobj, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameClMemObjectW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClMemObjectW_impl_fntype local = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
    if (local != 0)
    {
        (*local)(memobj, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)memobj;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (sampler, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameClSamplerA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClSamplerA_impl_fntype local = (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
    if (local != 0)
    {
        (*local)(sampler, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)sampler;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (sampler, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameClSamplerW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClSamplerW_impl_fntype local = (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
    if (local != 0)
    {
        (*local)(sampler, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)sampler;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (program, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameClProgramA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClProgramA_impl_fntype local = (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
    if (local != 0)
    {
        (*local)(program, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)program;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (program, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameClProgramW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClProgramW_impl_fntype local = (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
    if (local != 0)
    {
        (*local)(program, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)program;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards (evnt, name) to the implementation stored in
 * the nvtxGlobals slot nvtxNameClEventA_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClEventA_impl_fntype local = (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
    if (local != 0)
    {
        (*local)(evnt, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)evnt;
    (void)name;
#endif /* NVTX_DISABLE */
}
/* Public entry point (wide-character name): forwards (evnt, name) to the
 * implementation stored in the nvtxGlobals slot nvtxNameClEventW_impl_fnptr,
 * if one is installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    nvtxNameClEventW_impl_fntype local = (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
    if (local != 0)
    {
        (*local)(evnt, name);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)evnt;
    (void)name;
#endif /* NVTX_DISABLE */
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,83 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_SYNC
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Function-pointer types used by the nvtxDomainSyncUser* wrappers below. Each
 * wrapper casts its nvtxGlobals slot to the matching type before calling
 * through it. Only the Create variant returns a value (an nvtxSyncUser_t). */
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
/* Public entry point: forwards (domain, attribs) to the implementation stored
 * in the nvtxGlobals slot nvtxDomainSyncUserCreate_impl_fnptr and returns its
 * result. Returns (nvtxSyncUser_t)0 when the slot is null or when NVTX_DISABLE
 * is defined. */
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    if (local != 0)
    {
        return (*local)(domain, attribs);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)domain;
    (void)attribs;
#endif /* NVTX_DISABLE */
    /* Shared fallback: no implementation available. */
    return (nvtxSyncUser_t)0;
}
/* Public entry point: forwards handle to the implementation stored in the
 * nvtxGlobals slot nvtxDomainSyncUserDestroy_impl_fnptr, if one is installed.
 * A null slot, or a build with NVTX_DISABLE defined, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if (local != 0)
    {
        (*local)(handle);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)handle;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards handle to the implementation stored in the
 * nvtxGlobals slot nvtxDomainSyncUserAcquireStart_impl_fnptr, if one is
 * installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
    if (local != 0)
    {
        (*local)(handle);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)handle;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards handle to the implementation stored in the
 * nvtxGlobals slot nvtxDomainSyncUserAcquireFailed_impl_fnptr, if one is
 * installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
    if (local != 0)
    {
        (*local)(handle);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)handle;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards handle to the implementation stored in the
 * nvtxGlobals slot nvtxDomainSyncUserAcquireSuccess_impl_fnptr, if one is
 * installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
    if (local != 0)
    {
        (*local)(handle);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)handle;
#endif /* NVTX_DISABLE */
}
/* Public entry point: forwards handle to the implementation stored in the
 * nvtxGlobals slot nvtxDomainSyncUserReleasing_impl_fnptr, if one is
 * installed. A null slot, or NVTX_DISABLE, makes this a no-op. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
    if (local != 0)
    {
        (*local)(handle);
    }
#else /* NVTX_DISABLE */
    /* Nothing is called when NVTX is compiled out; silence unused-parameter warnings. */
    (void)handle;
#endif /* NVTX_DISABLE */
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,312 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* ---- Platform-independent helper definitions and functions ---- */
/* Prefer macros over inline functions to reduce symbol resolution at link time */
/* Platform abstraction used by the initialization code below: path character
 * type, environment lookup, dynamic-library handling, thread yield, memory
 * barrier, and 32-bit atomic write / compare-and-swap.
 *
 * NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) follows the Win32
 * InterlockedCompareExchange convention: if *address equals comparand it is
 * replaced with exchange, and the previous value of *address is stored in old. */
#if defined(_WIN32)
#define NVTX_PATHCHAR wchar_t
#define NVTX_STR(x) L##x
#define NVTX_GETENV _wgetenv
#define NVTX_BUFSIZE MAX_PATH
#define NVTX_DLLHANDLE HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC GetProcAddress
#define NVTX_DLLCLOSE FreeLibrary
#define NVTX_YIELD() SwitchToThread()
#define NVTX_MEMBAR() MemoryBarrier()
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
#elif defined(__GNUC__)
#define NVTX_PATHCHAR char
#define NVTX_STR(x) x
#define NVTX_GETENV getenv
#define NVTX_BUFSIZE PATH_MAX
#define NVTX_DLLHANDLE void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC dlsym
#define NVTX_DLLCLOSE dlclose
#define NVTX_YIELD() sched_yield()
#define NVTX_MEMBAR() __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions.
 * Each macro expands to ONE parenthesized comma expression, so it stays a
 * single statement even under an unbraced if/else. */
#define NVTX_ATOMIC_WRITE_32(address, value) (__sync_synchronize(), __sync_lock_test_and_set((address), (value)))
/* __sync_val_compare_and_swap takes (ptr, oldval, newval): the comparand is the
 * expected old value and must be passed BEFORE the exchange value. */
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) (__sync_synchronize(), (old) = __sync_val_compare_and_swap((address), (comparand), (exchange)))
#else
#error The library does not support your configuration!
#endif
/* Define this to 1 for platforms where pre-injected libraries can be discovered.
 * Both branches currently set it to 0, so the pre-injected lookup in
 * nvtxInitializeInjectionLibrary is compiled out everywhere. */
#if defined(_WIN32)
/* TODO */
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#else
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#endif
/* Define this to 1 for platforms that support environment variables */
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
#define NVTX_SUPPORT_ENV_VARS 1
/* Define this to 1 for platforms that support dynamic/shared libraries */
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
/* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked,
 * and this will override any dynamic injection. Useful for platforms where dynamic
 * injection is not available. Since weak symbols not explicitly marked extern are
 * guaranteed to be initialized to zero if no definitions are found by the linker, the
 * dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0.
 * NOTE(review): nvtxInitializeInjectionLibrary consults InitializeInjectionNvtx2_fnptr
 * only when the dynamic search left init_fnptr unset, so as written a dynamic
 * injection wins over a static one -- confirm which precedence is intended. */
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
/* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal
 * symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which
 * does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic
 * injection library). */
__attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr;
#else
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
#endif
/* This function tries to find or load an NVTX injection library and get the
 * address of its InitializeInjection2 function. If such a function pointer
 * is found, it is called, and passed the address of this NVTX instance's
 * nvtxGetExportTable function, so the injection can attach to this instance.
 * If the initialization fails for any reason, any dynamic library loaded will
 * be freed, and all NVTX implementation functions will be set to no-ops. If
 * initialization succeeds, NVTX functions not attached to the tool will be set
 * to no-ops. This is implemented as one function instead of several small
 * functions to minimize the number of weak symbols the linker must resolve.
 * Order of search is:
 * - Pre-injected library exporting InitializeInjectionNvtx2
 * - Loadable library exporting InitializeInjectionNvtx2
 * - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
 * - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
 * - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
 *
 * Returns NVTX_SUCCESS when an init function was found and returned non-zero.
 * Otherwise returns one of the NVTX_ERR_* codes used below:
 * NVTX_ERR_INIT_ACCESS_LIBRARY, NVTX_ERR_INIT_LOAD_LIBRARY,
 * NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT,
 * NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE or
 * NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT.
 */
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void);
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void)
{
const char* const initFuncName = "InitializeInjectionNvtx2";
NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0;
/* Stays 0 unless a library is loaded here; only then is it closed on failure. */
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
int entryPointStatus = 0;
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
/* Use POSIX global symbol chain to query for init function from any module */
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName);
#endif
#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
/* Try discovering dynamic injection library to load */
if (!init_fnptr)
{
#if NVTX_SUPPORT_ENV_VARS
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
 * to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
? NVTX_STR("NVTX_INJECTION32_PATH")
: NVTX_STR("NVTX_INJECTION64_PATH");
#endif /* NVTX_SUPPORT_ENV_VARS */
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
/* Refer to this variable explicitly in case all references to it are #if'ed out */
(void)injectionLibraryPathBuf;
#if NVTX_SUPPORT_ENV_VARS
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
 * these functions are not called again before using the returned value. */
#if defined(_MSC_VER)
#pragma warning( push )
#pragma warning( disable : 4996 )
#endif
injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
#if defined(_MSC_VER)
#pragma warning( pop )
#endif
#endif
#if defined(__ANDROID__)
/* Android fallback, used only when the env var gave no path: derive the
 * package name from /proc/<pid>/cmdline and look for the injection library
 * under /data/data/<package>/files/. Every failure on this path returns
 * NVTX_ERR_INIT_ACCESS_LIBRARY. */
if (!injectionLibraryPath)
{
const char *bits = (sizeof(void*) == 4) ? "32" : "64";
char cmdlineBuf[32];
char pkgName[PATH_MAX];
int count;
int pid;
FILE *fp;
size_t bytesRead;
size_t pos;
pid = (int)getpid();
count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
{
NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
fp = fopen(cmdlineBuf, "r");
if (!fp)
{
NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
/* Read at most sizeof(pkgName) - 1 bytes so the terminator below always fits. */
bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
fclose(fp);
if (bytesRead == 0)
{
NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
pkgName[bytesRead] = 0;
/* String can contain colon as a process separator. In this case the package name is before the colon. */
pos = 0;
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
{
++pos;
}
pkgName[pos] = 0;
count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
if (count <= 0 || count >= NVTX_BUFSIZE)
{
NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
/* On Android, verify path is accessible due to aggressive file access restrictions. */
/* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
/* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
if (injectionLibraryPathBuf[0] == '/')
{
#if (__ANDROID_API__ < 21)
int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
#else
int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
#endif
if (access_err != 0)
{
NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
}
injectionLibraryPath = injectionLibraryPathBuf;
}
#endif
/* At this point, injectionLibraryPath is specified if a dynamic
 * injection library was specified by a tool. */
if (injectionLibraryPath)
{
/* Load the injection library */
injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
if (!injectionLibraryHandle)
{
NVTX_ERR("Failed to load injection library\n");
return NVTX_ERR_INIT_LOAD_LIBRARY;
}
else
{
/* Attempt to get the injection library's entry-point */
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
if (!init_fnptr)
{
/* Library loaded but lacks the entry point: unload it before reporting. */
NVTX_DLLCLOSE(injectionLibraryHandle);
NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n");
return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
}
}
}
}
#endif
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
if (!init_fnptr)
{
/* Check weakly-defined function pointer. A statically-linked injection can define this as
 * a normal symbol and it will take precedence over a dynamic injection.
 * NOTE(review): this branch runs only when init_fnptr is still unset after the
 * dynamic search above, so a dynamic injection that was found is used instead
 * of the static one -- confirm the intended precedence. */
if (InitializeInjectionNvtx2_fnptr)
{
init_fnptr = InitializeInjectionNvtx2_fnptr;
}
}
#endif
/* At this point, if init_fnptr is not set, then no tool has specified
 * an NVTX injection library -- return non-success result so all NVTX
 * API functions will be set to no-ops. */
if (!init_fnptr)
{
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
}
/* Invoke injection library's initialization function. If it returns
 * 0 (failure) and a dynamic injection was loaded, unload it. */
entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable));
if (entryPointStatus == 0)
{
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
if (injectionLibraryHandle)
{
NVTX_DLLCLOSE(injectionLibraryHandle);
}
return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT;
}
return NVTX_SUCCESS;
}
/* Runs NVTX initialization and returns once nvtxGlobals.initState reads
 * NVTX_INIT_STATE_COMPLETE. The caller that observes NVTX_INIT_STATE_FRESH from
 * the compare-and-swap performs the initialization; every other caller yields
 * in a loop until the state becomes COMPLETE. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void)
{
unsigned int old;
/* Fast path: initialization already finished, nothing to do. */
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE)
{
return;
}
/* Request initState: FRESH -> STARTED (exchange = STARTED, comparand = FRESH);
 * old receives the value the macro reports as previously stored.
 * NOTE(review): exclusivity of the branch below depends on NVTX_ATOMIC_CAS_32
 * really performing that swap on the target platform -- verify the macro. */
NVTX_ATOMIC_CAS_32(
old,
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
NVTX_INIT_STATE_STARTED,
NVTX_INIT_STATE_FRESH);
if (old == NVTX_INIT_STATE_FRESH)
{
int result;
int forceAllToNoops;
/* Load & initialize injection library -- it will assign the function pointers */
result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)();
/* Set all pointers not assigned by the injection to null */
forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */
NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops);
/* Signal that initialization has finished, so now the assigned function pointers will be used */
NVTX_ATOMIC_WRITE_32(
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
NVTX_INIT_STATE_COMPLETE);
}
else /* Spin-wait until initialization has finished */
{
/* Barrier before each read of initState; yield between polls. */
NVTX_MEMBAR();
while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE)
{
NVTX_YIELD();
NVTX_MEMBAR();
}
}
}
@@ -0,0 +1,81 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* Forward declarations of the versioned *_impl_init functions, one per NVTX
 * API entry point. Each mirrors the parameter list and return type of the
 * corresponding public function; handle parameters for the nvtxNameCu*,
 * nvtxNameCl* and nvtxNameCuda* groups use the nvtx_-prefixed type names. */
/* Markers and ranges */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void);
/* Category and OS-thread naming */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name);
/* nvtxNameCu* group (nvtx_CU* handle types) */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name);
/* nvtxNameCl* group (nvtx_cl_* handle types) */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name);
/* nvtxNameCuda* group (int device index, nvtx_cuda*_t handle types) */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name);
/* nvtxDomain* group (all take or return an nvtxDomainHandle_t, except ResourceDestroy) */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved);
/* nvtxDomainSyncUser* group */
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle);
@@ -0,0 +1,573 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* Init stub for nvtxMarkEx: calls nvtxInitOnce(), then nvtxMarkEx(eventAttrib). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkEx(eventAttrib);
}
/* Init stub for nvtxMarkA: calls nvtxInitOnce(), then nvtxMarkA(message). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkA(message);
}
/* Init stub for nvtxMarkW: calls nvtxInitOnce(), then nvtxMarkW(message). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkW(message);
}
/* Init stub for nvtxRangeStartEx: calls nvtxInitOnce(), then returns the
 * value produced by nvtxRangeStartEx(eventAttrib). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartEx(eventAttrib);
    return result;
}
/* Init stub for nvtxRangeStartA: calls nvtxInitOnce(), then returns the
 * value produced by nvtxRangeStartA(message). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartA(message);
    return result;
}
/* Init stub for nvtxRangeStartW: calls nvtxInitOnce(), then returns the
 * value produced by nvtxRangeStartW(message). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartW(message);
    return result;
}
/* Init stub for nvtxRangeEnd: calls nvtxInitOnce(), then nvtxRangeEnd(id). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxRangeEnd(id);
}
/* Init stub for nvtxRangePushEx: calls nvtxInitOnce(), then returns the
 * value produced by nvtxRangePushEx(eventAttrib). */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushEx(eventAttrib);
    return result;
}
/* Init stub for nvtxRangePushA: calls nvtxInitOnce(), then returns the
 * value produced by nvtxRangePushA(message). */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushA(message);
    return result;
}
/* Init stub for nvtxRangePushW: calls nvtxInitOnce(), then returns the
 * value produced by nvtxRangePushW(message). */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushW(message);
    return result;
}
/* Init stub for nvtxRangePop: calls nvtxInitOnce(), then returns the value
 * produced by nvtxRangePop(). */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePop();
    return result;
}
/* Init stub for nvtxNameCategoryA: calls nvtxInitOnce(), then
 * nvtxNameCategoryA(category, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameCategoryA(category, name);
}
/* Init stub for nvtxNameCategoryW: calls nvtxInitOnce(), then
 * nvtxNameCategoryW(category, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameCategoryW(category, name);
}
/* Init stub for nvtxNameOsThreadA: calls nvtxInitOnce(), then
 * nvtxNameOsThreadA(threadId, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameOsThreadA(threadId, name);
}
/* Init stub for nvtxNameOsThreadW: calls nvtxInitOnce(), then
 * nvtxNameOsThreadW(threadId, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameOsThreadW(threadId, name);
}
/* Init stub for nvtxDomainMarkEx: calls nvtxInitOnce(), then
 * nvtxDomainMarkEx(domain, eventAttrib). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainMarkEx(domain, eventAttrib);
}
/* Init stub for nvtxDomainRangeStartEx: calls nvtxInitOnce(), then returns
 * the value produced by nvtxDomainRangeStartEx(domain, eventAttrib). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangeStartEx(domain, eventAttrib);
    return result;
}
/* Init stub for nvtxDomainRangeEnd: calls nvtxInitOnce(), then
 * nvtxDomainRangeEnd(domain, id). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainRangeEnd(domain, id);
}
/* Init stub for nvtxDomainRangePushEx: calls nvtxInitOnce(), then returns
 * the value produced by nvtxDomainRangePushEx(domain, eventAttrib). */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangePushEx(domain, eventAttrib);
    return result;
}
/* Init stub for nvtxDomainRangePop: calls nvtxInitOnce(), then returns the
 * value produced by nvtxDomainRangePop(domain). */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangePop(domain);
    return result;
}
/* Init stub for nvtxDomainResourceCreate: calls nvtxInitOnce(), then returns
 * the value produced by nvtxDomainResourceCreate(domain, attribs). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
    nvtxResourceHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainResourceCreate(domain, attribs);
    return result;
}
/* Init stub for nvtxDomainResourceDestroy: calls nvtxInitOnce(), then
 * nvtxDomainResourceDestroy(resource). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainResourceDestroy(resource);
}
/* Init stub for nvtxDomainNameCategoryA: calls nvtxInitOnce(), then
 * nvtxDomainNameCategoryA(domain, category, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainNameCategoryA(domain, category, name);
}
/* Init stub for nvtxDomainNameCategoryW: calls nvtxInitOnce(), then
 * nvtxDomainNameCategoryW(domain, category, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainNameCategoryW(domain, category, name);
}
/* Init stub for nvtxDomainRegisterStringA: calls nvtxInitOnce(), then returns
 * the value produced by nvtxDomainRegisterStringA(domain, string). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string)
{
    nvtxStringHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRegisterStringA(domain, string);
    return result;
}
/* Init stub for nvtxDomainRegisterStringW: calls nvtxInitOnce(), then returns
 * the value produced by nvtxDomainRegisterStringW(domain, string). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string)
{
    nvtxStringHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRegisterStringW(domain, string);
    return result;
}
/* Init stub for nvtxDomainCreateA: calls nvtxInitOnce(), then returns the
 * value produced by nvtxDomainCreateA(message). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message)
{
    nvtxDomainHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainCreateA(message);
    return result;
}
/* Init stub for nvtxDomainCreateW: calls nvtxInitOnce(), then returns the
 * value produced by nvtxDomainCreateW(message). */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message)
{
    nvtxDomainHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainCreateW(message);
    return result;
}
/* Init stub for nvtxDomainDestroy: calls nvtxInitOnce(), then
 * nvtxDomainDestroy(domain). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainDestroy(domain);
}
/* Init stub for nvtxInitialize: calls nvtxInitOnce(), then
 * nvtxInitialize(reserved). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxInitialize(reserved);
}
/* Init stub for nvtxNameCuDeviceA: calls nvtxInitOnce(), then reads
 * nvtxNameCuDeviceA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name)
{
    nvtxNameCuDeviceA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(device, name);
}
/* Init stub for nvtxNameCuDeviceW: calls nvtxInitOnce(), then reads
 * nvtxNameCuDeviceW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name)
{
    nvtxNameCuDeviceW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(device, name);
}
/* Init stub for nvtxNameCuContextA: calls nvtxInitOnce(), then reads
 * nvtxNameCuContextA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name)
{
    nvtxNameCuContextA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(context, name);
}
/* Init stub for nvtxNameCuContextW: calls nvtxInitOnce(), then reads
 * nvtxNameCuContextW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name)
{
    nvtxNameCuContextW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(context, name);
}
/* Init stub for nvtxNameCuStreamA: calls nvtxInitOnce(), then reads
 * nvtxNameCuStreamA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name)
{
    nvtxNameCuStreamA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(stream, name);
}
/* Init stub for nvtxNameCuStreamW: calls nvtxInitOnce(), then reads
 * nvtxNameCuStreamW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name)
{
    nvtxNameCuStreamW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(stream, name);
}
/* Init stub for nvtxNameCuEventA: calls nvtxInitOnce(), then reads
 * nvtxNameCuEventA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name)
{
    nvtxNameCuEventA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(event, name);
}
/* Init stub for nvtxNameCuEventW: calls nvtxInitOnce(), then reads
 * nvtxNameCuEventW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name)
{
    nvtxNameCuEventW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(event, name);
}
/* Init stub for nvtxNameCudaDeviceA: calls nvtxInitOnce(), then reads
 * nvtxNameCudaDeviceA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name)
{
    nvtxNameCudaDeviceA_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(device, name);
}
/* Init stub for nvtxNameCudaDeviceW: calls nvtxInitOnce(), then reads
 * nvtxNameCudaDeviceW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name)
{
    nvtxNameCudaDeviceW_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(device, name);
}
/* Init stub for nvtxNameCudaStreamA: calls nvtxInitOnce(), then reads
 * nvtxNameCudaStreamA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name)
{
    nvtxNameCudaStreamA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(stream, name);
}
/* Init stub for nvtxNameCudaStreamW: calls nvtxInitOnce(), then reads
 * nvtxNameCudaStreamW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (stream, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name)
{
    nvtxNameCudaStreamW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(stream, name);
}
/* Init stub for nvtxNameCudaEventA: calls nvtxInitOnce(), then reads
 * nvtxNameCudaEventA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name)
{
    nvtxNameCudaEventA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(event, name);
}
/* Init stub for nvtxNameCudaEventW: calls nvtxInitOnce(), then reads
 * nvtxNameCudaEventW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (event, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name)
{
    nvtxNameCudaEventW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(event, name);
}
/* Init stub for nvtxNameClDeviceA: calls nvtxInitOnce(), then reads
 * nvtxNameClDeviceA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name)
{
    nvtxNameClDeviceA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(device, name);
}
/* Init stub for nvtxNameClDeviceW: calls nvtxInitOnce(), then reads
 * nvtxNameClDeviceW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (device, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name)
{
    nvtxNameClDeviceW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(device, name);
}
/* Init stub for nvtxNameClContextA: calls nvtxInitOnce(), then reads
 * nvtxNameClContextA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name)
{
    nvtxNameClContextA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(context, name);
}
/* Init stub for nvtxNameClContextW: calls nvtxInitOnce(), then reads
 * nvtxNameClContextW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (context, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name)
{
    nvtxNameClContextW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(context, name);
}
/* Init stub for nvtxNameClCommandQueueA: calls nvtxInitOnce(), then reads
 * nvtxNameClCommandQueueA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (command_queue, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name)
{
    nvtxNameClCommandQueueA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(command_queue, name);
}
/* Init stub for nvtxNameClCommandQueueW: calls nvtxInitOnce(), then reads
 * nvtxNameClCommandQueueW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (command_queue, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name)
{
    nvtxNameClCommandQueueW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(command_queue, name);
}
/* Init stub for nvtxNameClMemObjectA: calls nvtxInitOnce(), then reads
 * nvtxNameClMemObjectA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (memobj, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name)
{
    nvtxNameClMemObjectA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(memobj, name);
}
/* Init stub for nvtxNameClMemObjectW: calls nvtxInitOnce(), then reads
 * nvtxNameClMemObjectW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (memobj, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name)
{
    nvtxNameClMemObjectW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(memobj, name);
}
/* Init stub for nvtxNameClSamplerA: calls nvtxInitOnce(), then reads
 * nvtxNameClSamplerA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (sampler, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name)
{
    nvtxNameClSamplerA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(sampler, name);
}
/* Init stub for nvtxNameClSamplerW: calls nvtxInitOnce(), then reads
 * nvtxNameClSamplerW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (sampler, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name)
{
    nvtxNameClSamplerW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(sampler, name);
}
/* Init stub for nvtxNameClProgramA: calls nvtxInitOnce(), then reads
 * nvtxNameClProgramA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (program, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name)
{
    nvtxNameClProgramA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(program, name);
}
/* Init stub for nvtxNameClProgramW: calls nvtxInitOnce(), then reads
 * nvtxNameClProgramW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (program, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name)
{
    nvtxNameClProgramW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(program, name);
}
/* Init stub for nvtxNameClEventA: calls nvtxInitOnce(), then reads
 * nvtxNameClEventA_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (evnt, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name)
{
    nvtxNameClEventA_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(evnt, name);
}
/* Init stub for nvtxNameClEventW: calls nvtxInitOnce(), then reads
 * nvtxNameClEventW_impl_fnptr from nvtxGlobals and, unless it is null,
 * invokes it with (evnt, name). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name)
{
    nvtxNameClEventW_fakeimpl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(evnt, name);
}
/* Init stub for nvtxDomainSyncUserCreate: calls nvtxInitOnce(), then reads
 * nvtxDomainSyncUserCreate_impl_fnptr from nvtxGlobals. Returns the result of
 * invoking it with (domain, attribs), or (nvtxSyncUser_t)0 when the slot is
 * null. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
    nvtxDomainSyncUserCreate_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    if (!impl) {
        return (nvtxSyncUser_t)0;
    }
    return impl(domain, attribs);
}
/* Init stub for nvtxDomainSyncUserDestroy: calls nvtxInitOnce(), then reads
 * nvtxDomainSyncUserDestroy_impl_fnptr from nvtxGlobals and, unless it is
 * null, invokes it with (handle). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserDestroy_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(handle);
}
/* Init stub for nvtxDomainSyncUserAcquireStart: calls nvtxInitOnce(), then
 * reads nvtxDomainSyncUserAcquireStart_impl_fnptr from nvtxGlobals and, unless
 * it is null, invokes it with (handle). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireStart_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(handle);
}
/* Init stub for nvtxDomainSyncUserAcquireFailed: calls nvtxInitOnce(), then
 * reads nvtxDomainSyncUserAcquireFailed_impl_fnptr from nvtxGlobals and,
 * unless it is null, invokes it with (handle). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireFailed_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(handle);
}
/* Init stub for nvtxDomainSyncUserAcquireSuccess: calls nvtxInitOnce(), then
 * reads nvtxDomainSyncUserAcquireSuccess_impl_fnptr from nvtxGlobals and,
 * unless it is null, invokes it with (handle). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireSuccess_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(handle);
}
/* Init stub for nvtxDomainSyncUserReleasing: calls nvtxInitOnce(), then reads
 * nvtxDomainSyncUserReleasing_impl_fnptr from nvtxGlobals and, unless it is
 * null, invokes it with (handle). */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserReleasing_impl_fntype impl;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* The slot is sampled only after nvtxInitOnce() has returned. */
    impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
    if (!impl) {
        return;
    }
    impl(handle);
}
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops);

/* Statement helper used only inside nvtxSetInitFunctionsToNoops (and removed
 * again with #undef right after it). For a given API name it sets the
 * <api>_impl_fnptr member of nvtxGlobals to NULL when that member still equals
 * the address of <api>_impl_init, or when forceAllToNoops is nonzero.
 * Both uses of `api` are token-pasted, so the argument is never macro-expanded
 * on its own. */
#define NVTX_NOOP_IF_STILL_INIT(api) \
    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).api##_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(api##_impl_init) || forceAllToNoops) \
        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).api##_impl_fnptr = NULL

/* Walks every function-pointer slot in nvtxGlobals and replaces it with NULL
 * if it still points at its *_impl_init stub.
 *
 * forceAllToNoops: when nonzero, every slot is set to NULL regardless of its
 *                  current value.
 *
 * Slots are visited in the same fixed order as before: core, CUDA driver,
 * OpenCL, CUDA runtime, domain ("core2") and sync-user entries. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops)
{
    /* Core marker / range / naming entries */
    NVTX_NOOP_IF_STILL_INIT(nvtxMarkEx);
    NVTX_NOOP_IF_STILL_INIT(nvtxMarkA);
    NVTX_NOOP_IF_STILL_INIT(nvtxMarkW);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangeStartEx);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangeStartA);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangeStartW);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangeEnd);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangePushEx);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangePushA);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangePushW);
    NVTX_NOOP_IF_STILL_INIT(nvtxRangePop);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCategoryA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCategoryW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameOsThreadA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameOsThreadW);

    /* nvtxNameCu* entries */
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuDeviceA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuDeviceW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuContextA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuContextW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuStreamA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuStreamW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuEventA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCuEventW);

    /* nvtxNameCl* entries */
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClDeviceA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClDeviceW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClContextA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClContextW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClCommandQueueA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClCommandQueueW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClMemObjectA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClMemObjectW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClSamplerA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClSamplerW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClProgramA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClProgramW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClEventA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameClEventW);

    /* nvtxNameCuda* entries */
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCudaDeviceA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCudaDeviceW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCudaStreamA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCudaStreamW);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCudaEventA);
    NVTX_NOOP_IF_STILL_INIT(nvtxNameCudaEventW);

    /* Domain entries and nvtxInitialize */
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainMarkEx);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainRangeStartEx);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainRangeEnd);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainRangePushEx);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainRangePop);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainResourceCreate);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainResourceDestroy);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainNameCategoryA);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainNameCategoryW);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainRegisterStringA);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainRegisterStringW);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainCreateA);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainCreateW);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainDestroy);
    NVTX_NOOP_IF_STILL_INIT(nvtxInitialize);

    /* Sync-user entries */
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainSyncUserCreate);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainSyncUserDestroy);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainSyncUserAcquireStart);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainSyncUserAcquireFailed);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainSyncUserAcquireSuccess);
    NVTX_NOOP_IF_STILL_INIT(nvtxDomainSyncUserReleasing);
}

#undef NVTX_NOOP_IF_STILL_INIT
@@ -0,0 +1,83 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef __NVTX_LINKONCE_H__
#define __NVTX_LINKONCE_H__
/* This header defines macros to permit making definitions of global variables
* and functions in C/C++ header files which may be included multiple times in
* a translation unit or linkage unit. It allows authoring header-only libraries
* which can be used by multiple other header-only libraries (either as the same
* copy or multiple copies), and does not require any build changes, such as
* adding another .c file, linking a static library, or deploying a dynamic
* library. Globals defined with these macros have the property that they have
* the same address, pointing to a single instance, for the entire linkage unit.
* It is expected but not guaranteed that each linkage unit will have a separate
* instance.
*
* In some situations it is desirable to declare a variable without initializing
* it, refer to it in code or other variables' initializers, and then initialize
* it later. Similarly, functions can be prototyped, have their address taken,
* and then have their body defined later. In such cases, use the FWDDECL macros
* when forward-declaring LINKONCE global variables without initializers and
* function prototypes, and then use the DEFINE macros when later defining them.
* Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
* following this pattern makes code maximally portable.
*/
/* Per-toolchain selection of the attributes behind the LINKONCE macros.
 * Every branch defines NVTX_LINKONCE_DEFINE_GLOBAL and
 * NVTX_LINKONCE_DEFINE_FUNCTION; the FWDDECL forms are derived from them
 * after the conditional. NVTX_LINKONCE_WEAK / NVTX_LINKONCE_HIDDEN are
 * helper macros and are only defined in the branches that use them. */
#if defined(__MINGW32__) /* MinGW */
/* Functions are placed in a ".gnu.linkonce.0." section; globals use
 * __declspec(selectany). C++ functions additionally get C linkage + inline. */
#define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
#endif
#elif defined(_MSC_VER) /* MSVC */
/* Globals use __declspec(selectany); functions are inline (__inline in C).
 * NVTX_LINKONCE_WEAK is not defined in this branch. */
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION __inline
#endif
#elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */
/* Both globals and functions are placed in a ".gnu.linkonce.0." section;
 * no inline is used here. */
#define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
#endif
#elif defined(__CYGWIN__) /* Assume GCC or compatible */
/* C++: selectany globals and inline functions. C: weak symbols for both. */
#define NVTX_LINKONCE_WEAK __attribute__((weak))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
#endif
#else /* All others: Assume GCC, clang, or compatible */
/* Globals are weak with hidden visibility. Functions are hidden and either
 * inline (C++) or weak (C). */
#define NVTX_LINKONCE_WEAK __attribute__((weak))
#define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden")))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
#endif
#endif
/* Forward declarations: a global is the DEFINE form followed by `extern`;
 * a function prototype uses exactly the DEFINE form. */
#define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern
#define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION
#endif /* __NVTX_LINKONCE_H__ */
@@ -0,0 +1,304 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* This header defines types which are used by the internal implementation
* of NVTX and callback subscribers. API clients do not use these types,
* so they are defined here instead of in nvToolsExt.h to clarify they are
* not part of the NVTX client API. */
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h.
#endif
/* ------ Dependency-free types binary-compatible with real types ------- */
/* In order to avoid having the NVTX core API headers depend on non-NVTX
* headers like cuda.h, NVTX defines binary-compatible types to use for
* safely making the initialization versions of all NVTX functions without
* needing to have definitions for the real types. */
typedef int nvtx_CUdevice;
typedef void* nvtx_CUcontext;
typedef void* nvtx_CUstream;
typedef void* nvtx_CUevent;
typedef void* nvtx_cudaStream_t;
typedef void* nvtx_cudaEvent_t;
typedef void* nvtx_cl_platform_id;
typedef void* nvtx_cl_device_id;
typedef void* nvtx_cl_context;
typedef void* nvtx_cl_command_queue;
typedef void* nvtx_cl_mem;
typedef void* nvtx_cl_program;
typedef void* nvtx_cl_kernel;
typedef void* nvtx_cl_event;
typedef void* nvtx_cl_sampler;
typedef struct nvtxSyncUser* nvtxSyncUser_t;
struct nvtxSyncUserAttributes_v0;
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
/* --------- Types for function pointers (with fake API types) ---------- */
typedef void (NVTX_API * nvtxMarkEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
typedef void (NVTX_API * nvtxMarkA_impl_fntype)(const char* message);
typedef void (NVTX_API * nvtxMarkW_impl_fntype)(const wchar_t* message);
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartA_impl_fntype)(const char* message);
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartW_impl_fntype)(const wchar_t* message);
typedef void (NVTX_API * nvtxRangeEnd_impl_fntype)(nvtxRangeId_t id);
typedef int (NVTX_API * nvtxRangePushEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
typedef int (NVTX_API * nvtxRangePushA_impl_fntype)(const char* message);
typedef int (NVTX_API * nvtxRangePushW_impl_fntype)(const wchar_t* message);
typedef int (NVTX_API * nvtxRangePop_impl_fntype)(void);
typedef void (NVTX_API * nvtxNameCategoryA_impl_fntype)(uint32_t category, const char* name);
typedef void (NVTX_API * nvtxNameCategoryW_impl_fntype)(uint32_t category, const wchar_t* name);
typedef void (NVTX_API * nvtxNameOsThreadA_impl_fntype)(uint32_t threadId, const char* name);
typedef void (NVTX_API * nvtxNameOsThreadW_impl_fntype)(uint32_t threadId, const wchar_t* name);
/* Real impl types are defined in nvtxImplCuda_v3.h, where CUDA headers are included */
typedef void (NVTX_API * nvtxNameCuDeviceA_fakeimpl_fntype)(nvtx_CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_fakeimpl_fntype)(nvtx_CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_fakeimpl_fntype)(nvtx_CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_fakeimpl_fntype)(nvtx_CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_fakeimpl_fntype)(nvtx_CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_fakeimpl_fntype)(nvtx_CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_fakeimpl_fntype)(nvtx_CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_fakeimpl_fntype)(nvtx_CUevent event, const wchar_t* name);
/* Real impl types are defined in nvtxImplOpenCL_v3.h, where OPENCL headers are included */
typedef void (NVTX_API * nvtxNameClDeviceA_fakeimpl_fntype)(nvtx_cl_device_id device, const char* name);
typedef void (NVTX_API * nvtxNameClDeviceW_fakeimpl_fntype)(nvtx_cl_device_id device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClContextA_fakeimpl_fntype)(nvtx_cl_context context, const char* name);
typedef void (NVTX_API * nvtxNameClContextW_fakeimpl_fntype)(nvtx_cl_context context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClCommandQueueA_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const char* name);
typedef void (NVTX_API * nvtxNameClCommandQueueW_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClMemObjectA_fakeimpl_fntype)(nvtx_cl_mem memobj, const char* name);
typedef void (NVTX_API * nvtxNameClMemObjectW_fakeimpl_fntype)(nvtx_cl_mem memobj, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClSamplerA_fakeimpl_fntype)(nvtx_cl_sampler sampler, const char* name);
typedef void (NVTX_API * nvtxNameClSamplerW_fakeimpl_fntype)(nvtx_cl_sampler sampler, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClProgramA_fakeimpl_fntype)(nvtx_cl_program program, const char* name);
typedef void (NVTX_API * nvtxNameClProgramW_fakeimpl_fntype)(nvtx_cl_program program, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClEventA_fakeimpl_fntype)(nvtx_cl_event evnt, const char* name);
typedef void (NVTX_API * nvtxNameClEventW_fakeimpl_fntype)(nvtx_cl_event evnt, const wchar_t* name);
/* Real impl types are defined in nvtxImplCudaRt_v3.h, where CUDART headers are included */
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_fakeimpl_fntype)(nvtx_cudaStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_fakeimpl_fntype)(nvtx_cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_fakeimpl_fntype)(nvtx_cudaEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_fakeimpl_fntype)(nvtx_cudaEvent_t event, const wchar_t* name);
typedef void (NVTX_API * nvtxDomainMarkEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
typedef nvtxRangeId_t (NVTX_API * nvtxDomainRangeStartEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
typedef void (NVTX_API * nvtxDomainRangeEnd_impl_fntype)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
typedef int (NVTX_API * nvtxDomainRangePushEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
typedef int (NVTX_API * nvtxDomainRangePop_impl_fntype)(nvtxDomainHandle_t domain);
typedef nvtxResourceHandle_t (NVTX_API * nvtxDomainResourceCreate_impl_fntype)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainResourceDestroy_impl_fntype)(nvtxResourceHandle_t resource);
typedef void (NVTX_API * nvtxDomainNameCategoryA_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
typedef void (NVTX_API * nvtxDomainNameCategoryW_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringA_impl_fntype)(nvtxDomainHandle_t domain, const char* string);
typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringW_impl_fntype)(nvtxDomainHandle_t domain, const wchar_t* string);
typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateA_impl_fntype)(const char* message);
typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateW_impl_fntype)(const wchar_t* message);
typedef void (NVTX_API * nvtxDomainDestroy_impl_fntype)(nvtxDomainHandle_t domain);
typedef void (NVTX_API * nvtxInitialize_impl_fntype)(const void* reserved);
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
/* ---------------- Types for callback subscription --------------------- */
typedef const void *(NVTX_API * NvtxGetExportTableFunc_t)(uint32_t exportTableId);
typedef int (NVTX_API * NvtxInitializeInjectionNvtxFunc_t)(NvtxGetExportTableFunc_t exportTable);
/* Identifies a callback module; this is the `module` argument type of
 * NvtxExportTableCallbacks::GetModuleFunctionTable. 0 is reserved as invalid.
 * NVTX_CB_MODULE_SIZE has no explicit value, so it is one more than the last
 * module ID (currently 7). */
typedef enum NvtxCallbackModule
{
NVTX_CB_MODULE_INVALID = 0,
NVTX_CB_MODULE_CORE = 1,
NVTX_CB_MODULE_CUDA = 2,
NVTX_CB_MODULE_OPENCL = 3,
NVTX_CB_MODULE_CUDART = 4,
NVTX_CB_MODULE_CORE2 = 5,
NVTX_CB_MODULE_SYNC = 6,
/* --- New constants must only be added directly above this line --- */
NVTX_CB_MODULE_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_CB_MODULE_FORCE_INT = 0x7fffffff
} NvtxCallbackModule;
/* Callback IDs for the core (non-domain) API. Each name matches one of the
 * nvtx*_impl_fntype typedefs declared earlier in this header (nvtxMarkEx ...
 * nvtxNameOsThreadW). 0 is reserved as invalid; NVTX_CBID_CORE_SIZE is one
 * more than the last ID (currently 16). */
typedef enum NvtxCallbackIdCore
{
NVTX_CBID_CORE_INVALID = 0,
NVTX_CBID_CORE_MarkEx = 1,
NVTX_CBID_CORE_MarkA = 2,
NVTX_CBID_CORE_MarkW = 3,
NVTX_CBID_CORE_RangeStartEx = 4,
NVTX_CBID_CORE_RangeStartA = 5,
NVTX_CBID_CORE_RangeStartW = 6,
NVTX_CBID_CORE_RangeEnd = 7,
NVTX_CBID_CORE_RangePushEx = 8,
NVTX_CBID_CORE_RangePushA = 9,
NVTX_CBID_CORE_RangePushW = 10,
NVTX_CBID_CORE_RangePop = 11,
NVTX_CBID_CORE_NameCategoryA = 12,
NVTX_CBID_CORE_NameCategoryW = 13,
NVTX_CBID_CORE_NameOsThreadA = 14,
NVTX_CBID_CORE_NameOsThreadW = 15,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CORE_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_CBID_CORE_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCore;
/* Callback IDs for the domain-based API plus Initialize. Each name matches
 * one of the nvtxDomain*_impl_fntype / nvtxInitialize_impl_fntype typedefs
 * declared earlier in this header. 0 is reserved as invalid;
 * NVTX_CBID_CORE2_SIZE is one more than the last ID (currently 16). */
typedef enum NvtxCallbackIdCore2
{
NVTX_CBID_CORE2_INVALID = 0,
NVTX_CBID_CORE2_DomainMarkEx = 1,
NVTX_CBID_CORE2_DomainRangeStartEx = 2,
NVTX_CBID_CORE2_DomainRangeEnd = 3,
NVTX_CBID_CORE2_DomainRangePushEx = 4,
NVTX_CBID_CORE2_DomainRangePop = 5,
NVTX_CBID_CORE2_DomainResourceCreate = 6,
NVTX_CBID_CORE2_DomainResourceDestroy = 7,
NVTX_CBID_CORE2_DomainNameCategoryA = 8,
NVTX_CBID_CORE2_DomainNameCategoryW = 9,
NVTX_CBID_CORE2_DomainRegisterStringA = 10,
NVTX_CBID_CORE2_DomainRegisterStringW = 11,
NVTX_CBID_CORE2_DomainCreateA = 12,
NVTX_CBID_CORE2_DomainCreateW = 13,
NVTX_CBID_CORE2_DomainDestroy = 14,
NVTX_CBID_CORE2_Initialize = 15,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CORE2_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_CBID_CORE2_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCore2;
/* Callback IDs for the CUDA driver-API naming functions. Each name matches
 * one of the nvtxNameCu*_fakeimpl_fntype typedefs declared earlier in this
 * header. 0 is reserved as invalid; NVTX_CBID_CUDA_SIZE is one more than the
 * last ID (currently 9). */
typedef enum NvtxCallbackIdCuda
{
NVTX_CBID_CUDA_INVALID = 0,
NVTX_CBID_CUDA_NameCuDeviceA = 1,
NVTX_CBID_CUDA_NameCuDeviceW = 2,
NVTX_CBID_CUDA_NameCuContextA = 3,
NVTX_CBID_CUDA_NameCuContextW = 4,
NVTX_CBID_CUDA_NameCuStreamA = 5,
NVTX_CBID_CUDA_NameCuStreamW = 6,
NVTX_CBID_CUDA_NameCuEventA = 7,
NVTX_CBID_CUDA_NameCuEventW = 8,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CUDA_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_CBID_CUDA_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCuda;
/* Callback IDs for the CUDA runtime naming functions. Each name matches one
 * of the nvtxNameCuda* function-pointer typedefs declared earlier in this
 * header. 0 is reserved as invalid; NVTX_CBID_CUDART_SIZE is one more than
 * the last ID (currently 7). */
typedef enum NvtxCallbackIdCudaRt
{
NVTX_CBID_CUDART_INVALID = 0,
NVTX_CBID_CUDART_NameCudaDeviceA = 1,
NVTX_CBID_CUDART_NameCudaDeviceW = 2,
NVTX_CBID_CUDART_NameCudaStreamA = 3,
NVTX_CBID_CUDART_NameCudaStreamW = 4,
NVTX_CBID_CUDART_NameCudaEventA = 5,
NVTX_CBID_CUDART_NameCudaEventW = 6,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CUDART_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_CBID_CUDART_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCudaRt;
/* Callback IDs for the OpenCL naming functions. Each name matches one of the
 * nvtxNameCl*_fakeimpl_fntype typedefs declared earlier in this header.
 * 0 is reserved as invalid; NVTX_CBID_OPENCL_SIZE is one more than the last
 * ID (currently 15). */
typedef enum NvtxCallbackIdOpenCL
{
NVTX_CBID_OPENCL_INVALID = 0,
NVTX_CBID_OPENCL_NameClDeviceA = 1,
NVTX_CBID_OPENCL_NameClDeviceW = 2,
NVTX_CBID_OPENCL_NameClContextA = 3,
NVTX_CBID_OPENCL_NameClContextW = 4,
NVTX_CBID_OPENCL_NameClCommandQueueA = 5,
NVTX_CBID_OPENCL_NameClCommandQueueW = 6,
NVTX_CBID_OPENCL_NameClMemObjectA = 7,
NVTX_CBID_OPENCL_NameClMemObjectW = 8,
NVTX_CBID_OPENCL_NameClSamplerA = 9,
NVTX_CBID_OPENCL_NameClSamplerW = 10,
NVTX_CBID_OPENCL_NameClProgramA = 11,
NVTX_CBID_OPENCL_NameClProgramW = 12,
NVTX_CBID_OPENCL_NameClEventA = 13,
NVTX_CBID_OPENCL_NameClEventW = 14,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_OPENCL_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_CBID_OPENCL_FORCE_INT = 0x7fffffff
} NvtxCallbackIdOpenCL;
/* Callback IDs for the user-synchronization API. Each name matches one of
 * the nvtxDomainSyncUser*_impl_fntype typedefs declared earlier in this
 * header. 0 is reserved as invalid; NVTX_CBID_SYNC_SIZE is one more than the
 * last ID (currently 7). */
typedef enum NvtxCallbackIdSync
{
NVTX_CBID_SYNC_INVALID = 0,
NVTX_CBID_SYNC_DomainSyncUserCreate = 1,
NVTX_CBID_SYNC_DomainSyncUserDestroy = 2,
NVTX_CBID_SYNC_DomainSyncUserAcquireStart = 3,
NVTX_CBID_SYNC_DomainSyncUserAcquireFailed = 4,
NVTX_CBID_SYNC_DomainSyncUserAcquireSuccess = 5,
NVTX_CBID_SYNC_DomainSyncUserReleasing = 6,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_SYNC_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_CBID_SYNC_FORCE_INT = 0x7fffffff
} NvtxCallbackIdSync;
/* IDs for NVTX export tables. These are the values passed as `exportTableId`
 * to an NvtxGetExportTableFunc_t. 0 is reserved as invalid and ID 2 is
 * reserved; NVTX_ETID_SIZE is one more than the last ID (currently 4). */
typedef enum NvtxExportTableID
{
NVTX_ETID_INVALID = 0,
NVTX_ETID_CALLBACKS = 1,
NVTX_ETID_RESERVED0 = 2,
NVTX_ETID_VERSIONINFO = 3,
/* --- New constants must only be added directly above this line --- */
NVTX_ETID_SIZE,
/* NOTE(review): presumably forces the enum to be at least int-sized -- confirm. */
NVTX_ETID_FORCE_INT = 0x7fffffff
} NvtxExportTableID;
typedef void (* NvtxFunctionPointer)(void); /* generic function pointer; not callable as-is, must be cast to the appropriate function type first */
typedef NvtxFunctionPointer** NvtxFunctionTable; /* double pointer because it is an array (1) of pointers (2) to function pointers */
/* Export table that gives access to the per-module function tables. */
typedef struct NvtxExportTableCallbacks
{
/* NOTE(review): presumably sizeof(NvtxExportTableCallbacks), by analogy with
 * NvtxExportTableVersionInfo::struct_size -- confirm. */
size_t struct_size;
/* For the given module, returns through out_table an array of pointers to
 * function pointers, and through out_size a count for that array. The
 * meaning of the int return value is not visible in this header. */
int (NVTX_API *GetModuleFunctionTable)(
NvtxCallbackModule module,
NvtxFunctionTable* out_table,
unsigned int* out_size);
} NvtxExportTableCallbacks;
/* Export table carrying version information between the NVTX library linked
 * into the application and an attaching tool. */
typedef struct NvtxExportTableVersionInfo
{
/* sizeof(NvtxExportTableVersionInfo) */
size_t struct_size;
/* The API version comes from the NVTX library linked to the app. The
* injection library can use this info to make some assumptions. */
uint32_t version;
/* Reserved for alignment, do not use */
uint32_t reserved0;
/* This must be set by tools when attaching, to give applications the
* ability to detect problematic tool versions in emergency situations
* and modify the NVTX source to prevent attaching anything that causes
* trouble in the app. Currently, this value is ignored. */
void (NVTX_API *SetInjectionNvtxVersion)(
uint32_t version);
} NvtxExportTableVersionInfo;
+15
Wyświetl plik
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVTX_STUB_H_
#define NCCL_NVTX_STUB_H_
// Tag type whose only member is the constant name string "NCCL".
struct nccl_domain{static constexpr char const* name{"NCCL"};};
// Stub: expands to nothing, so uses of NVTX3_FUNC_RANGE_IN(...) compile away.
// NOTE(review): presumably this header is used in place of the real NVTX headers when NVTX is disabled -- confirm against the include site.
#define NVTX3_FUNC_RANGE_IN(domain)
// Stub: replaces calls spelled nvtxNameOsThreadA(a, b) with nothing; the
// arguments are discarded by the preprocessor and never evaluated.
#define nvtxNameOsThreadA(syscall, thread)
#endif
+25 -14
Wyświetl plik
@@ -10,23 +10,34 @@
#define NCCL_P2P_H_
struct ncclP2Pinfo {
const void* sendbuff;
void* recvbuff;
ssize_t sendbytes;
ssize_t recvbytes;
};
struct ncclP2PConnect {
int nrecv[MAXCHANNELS];
int nsend[MAXCHANNELS];
int* recv;
int* send;
void* buff;
ssize_t nbytes;
struct ncclP2Pinfo* next;
};
struct ncclP2Plist {
struct ncclP2Pinfo *peerlist;
int count;
struct ncclP2PConnect connect;
struct ncclP2Pinfo *head;
struct ncclP2Pinfo *tail;
};
// Append one (buff, nBytes) record to the tail of the singly linked p2p list.
//   p2p    - list to append to; NULL yields ncclInternalError.
//   buff   - buffer pointer stored in the new record.
//   nBytes - byte count stored in the new record.
// Returns ncclSuccess once the record is linked in; an allocation failure is
// propagated by NCCLCHECK. The record is released by dequeueP2pInfo().
static ncclResult_t enqueueP2pInfo(ncclP2Plist* p2p, void* buff, ssize_t nBytes) {
  if (p2p == NULL) {
    return ncclInternalError;
  }

  // The node's `next` field is never written here; it keeps whatever value
  // ncclCalloc gave it (presumably zero/NULL -- confirm in ncclCalloc).
  struct ncclP2Pinfo* node;
  NCCLCHECK(ncclCalloc(&node, 1));
  node->nbytes = nBytes;
  node->buff = buff;

  // First element of an empty list also becomes the head.
  if (p2p->head == NULL) {
    p2p->head = node;
  }
  // Link behind the current tail, if any, then advance the tail.
  if (p2p->tail != NULL) {
    p2p->tail->next = node;
  }
  p2p->tail = node;

  return ncclSuccess;
}
// Remove and free the record at the head of the p2p list.
//   p2p - list to pop from; NULL yields ncclInternalError.
// Returns ncclSuccess after the head record has been unlinked and freed, or
// ncclInternalError if the list is empty (previously this dereferenced a NULL
// head pointer).
static ncclResult_t dequeueP2pInfo(ncclP2Plist* p2p) {
  if (p2p == NULL) return ncclInternalError;
  struct ncclP2Pinfo* temp = p2p->head;
  // Nothing to dequeue: report an error instead of dereferencing NULL.
  if (temp == NULL) return ncclInternalError;
  p2p->head = temp->next;
  // Removing the last record leaves the list empty, so clear the tail too.
  if (p2p->tail == temp) p2p->tail = NULL;
  free(temp);
  return ncclSuccess;
}
#endif
+3 -2
Wyświetl plik
@@ -31,10 +31,11 @@ static void setEnvFile(const char* fileName) {
int s=0; // Env Var Size
while (line[s] != '\0' && line[s] != '=') s++;
if (line[s] == '\0') continue;
strncpy(envVar, line, std::min(1024,s));
strncpy(envVar, line, std::min(1023,s));
envVar[s] = '\0';
s++;
strncpy(envValue, line+s, 1024);
strncpy(envValue, line+s, 1023);
envValue[1023]='\0';
setenv(envVar, envValue, 0);
}
if (line) free(line);
+30 -5
Wyświetl plik
@@ -18,18 +18,23 @@ struct ncclProxyArgs {
proxyProgressFunc_t progress;
struct ncclChannel* channel;
struct ncclConnector* connector;
size_t sendbytes;
size_t recvbytes;
int sliceSteps;
int chunkSteps;
int nsteps;
uint64_t opCount;
int protocol;
int segment; // Only for profiling
ncclDataType_t dtype;
ncclRedOp_t redOp;
int state; // add component before this line -- it is left out during initialization
// Internal state
uint64_t head;
uint64_t tail;
uint64_t posted;
uint64_t received; // Only used by recv proxy to wait for flush.
uint64_t transmitted;
uint64_t done;
uint64_t end;
void* requests[NCCL_STEPS];
int idle;
@@ -38,14 +43,30 @@ struct ncclProxyArgs {
pthread_mutex_t mutex;
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
struct ncclProxyArgs* nextGroup;
struct ncclProxyArgs** proxyAppendPtr;
};
// State for buffers shared through the proxy: a slot count and slot size,
// plus five arrays that each have 2*MAXCHANNELS entries.
// NOTE(review): the factor of 2 presumably gives each channel one entry for
// send and one for recv (see the proxyAppend comment) -- confirm against the
// ncclProxySharedBuffers* functions.
struct ncclProxySharedBuffers {
int nslots;
int slotSize;
// NOTE(review): cuda*/host* presumably refer to device- and host-memory
// variants of the same buffers, with *Used tracking slot usage -- confirm.
char* cudaBuff[2*MAXCHANNELS];
int* cudaUsed[2*MAXCHANNELS];
char* hostBuff[2*MAXCHANNELS];
int* hostUsed[2*MAXCHANNELS];
struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
};
struct ncclProxyPool;
struct ncclProxyState {
pthread_cond_t cond;
pthread_mutex_t mutex;
pthread_mutex_t opsMutex;
pthread_mutex_t poolMutex;
bool stop;
struct ncclProxySharedBuffers* sharedBuffs;
struct ncclProxyArgs* ops;
struct ncclProxyArgs* nextOps;
struct ncclProxyArgs* nextOpsEnd;
struct ncclProxyArgs* pool;
struct ncclProxyPool* pools;
};
@@ -59,12 +80,16 @@ enum proxyMode {
};
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel);
ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info);
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
#include <unistd.h>
// Spin wait until func evaluates to true
+12 -9
Wyświetl plik
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -21,6 +21,7 @@
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@@ -64,7 +65,7 @@ static inline int envSocketFamily(void) {
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
#ifdef ENABLE_TRACE
char line[1024];
char line[SOCKET_NAME_MAXLEN+1];
#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
@@ -167,9 +168,9 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
#ifdef ENABLE_TRACE
char line[1024];
char line[SOCKET_NAME_MAXLEN+1];
#endif
char line_a[1024];
char line_a[SOCKET_NAME_MAXLEN+1];
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
@@ -355,7 +356,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
#ifdef ENABLE_TRACE
char line[1024];
char line[SOCKET_NAME_MAXLEN+1];
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif
@@ -370,6 +371,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
/* IPv4/IPv6 support */
int family = remoteAddr->sa.sa_family;
if (family != AF_INET && family != AF_INET6) {
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
return ncclInternalError;
}
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
/* Connect to a hostname / port */
@@ -386,10 +391,8 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
char line[1024];
#ifdef ENABLE_TRACE
char line[SOCKET_NAME_MAXLEN+1];
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
#endif
int ret;
int timedout_retries = 0;
@@ -450,7 +453,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
return ncclSuccess;
}
static ncclResult_t socketReceive(int fd, void* ptr, int size) {
static ncclResult_t socketRecv(int fd, void* ptr, int size) {
int offset = 0;
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, ptr, size, &offset));
return ncclSuccess;
+4 -3
Wyświetl plik
@@ -41,8 +41,8 @@ struct ncclConnect {
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
};
@@ -54,6 +54,7 @@ struct ncclTransport {
struct ncclTransportComm recv;
};
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
#endif
+3 -3
Wyświetl plik
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +7,7 @@
#ifndef NCCL_TREES_H_
#define NCCL_TREES_H_
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
#endif
+169 -153
Wyświetl plik
@@ -41,7 +41,7 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
#endif
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "Gather", "Scatter", "AllToAll", "AllToAllv" };
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
@@ -160,47 +160,67 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
#ifdef ENABLE_COLLTRACE
void *ncclCommThreadMain(void *arg) {
ncclComm_t comm = (ncclComm_t)arg;
int head = comm->hostDevComm.collTraceHead;
do {
int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS;
int head = comm->hostDevComm.collTraceHead;
int count;
if (head <= tail)
count = tail - head;
else
count = COLLTRACE_NUM_ITEMS + head - tail;
usleep(1000); //sleep 1ms
if (!count) {
if(LOAD(&comm->hostDevComm.collTraceExit))
break;
else {
usleep(1000); //sleep 1ms
continue;
}
}
for (int i = 0; i < count; i++) {
uint8_t type = LOAD(&(comm->hostDevComm.collTrace[head].type));
if (type == ncclCollTraceNotReady)
break;
char line[1024];
int offset = 0;
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
(double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount);
offset = strlen(line);
switch (comm->hostDevComm.collTrace[head].type) {
case ncclCollTraceKernelLaunchType:
sprintf(line+offset, " KL hwid %8x funcIndex %d",
comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex);
break;
case ncclCollTraceCollEndType:
if (comm->hostDevComm.collTrace[head].funcIndex != -1)
sprintf(line+offset, " CE next funcIndex %d",
comm->hostDevComm.collTrace[head].funcIndex);
else
sprintf(line+offset, " KE");
break;
case ncclCollTraceAbortType:
sprintf(line+offset, " Abort");
break;
default:
sprintf(line+offset, " unknown collective trace data type");
break;
if (type == ncclCollTraceDataType) {
sprintf(line, "## [%12.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
(double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid,
comm->hostDevComm.collTrace[head].funcIndex,
comm->hostDevComm.collTrace[head].data_0,
comm->hostDevComm.collTrace[head].opCount,
comm->hostDevComm.collTrace[head].data_1);
} else {
sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
(double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount);
offset = strlen(line);
switch (type) {
case ncclCollTraceKernelLaunchType:
sprintf(line+offset, " KL hwid %8x funcIndex %d",
comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex);
break;
case ncclCollTraceCollEndType:
if (comm->hostDevComm.collTrace[head].funcIndex != -1)
sprintf(line+offset, " CE next funcIndex %d",
comm->hostDevComm.collTrace[head].funcIndex);
else
sprintf(line+offset, " KE");
break;
case ncclCollTraceAbortType:
sprintf(line+offset, " Abort");
break;
default:
sprintf(line+offset, " unknown collective trace data type");
break;
}
}
INFO(NCCL_COLL, "%s", line);
STORE(&(comm->hostDevComm.collTrace[head].type), ncclCollTraceNotReady);
head ++;
head %= COLLTRACE_NUM_ITEMS;
}
comm->hostDevComm.collTraceHead = tail;
} while(!LOAD(&comm->hostDevComm.collTraceExit));
} while(1);
comm->hostDevComm.collTraceHead = head;
pthread_exit(NULL);
}
#endif
@@ -210,9 +230,11 @@ void *ncclCommThreadMain(void *arg) {
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
free(comm->p2plist.peerlist);
free(comm->p2plist.connect.recv);
free(comm->p2plist.connect.send);
free(comm->connectSend);
free(comm->connectRecv);
free(comm->p2pSends);
free(comm->p2pRecvs);
free(comm->asyncOps);
#ifdef ENABLE_PROFILING
struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
@@ -290,7 +312,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->intraCGMode);
free(comm->intraCC);
}
CUDACHECK(hipHostFree((void *)comm->abortFlag));
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
// Poison comm to try and catch a double free
commPoison(comm);
@@ -319,7 +341,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
comm->rank = comm->hostDevComm.rank =rank;
comm->rank = comm->hostDevComm.rank = rank;
comm->nRanks = comm->hostDevComm.nRanks = ndev;
hipGetDevice(&comm->cudaDev);
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
@@ -355,17 +377,25 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->hostDevComm.collTraceThread = 0;
#endif
comm->collNetSupport = 0;
comm->p2plist.count=0;
NCCLCHECK(ncclCalloc(&comm->p2plist.peerlist, comm->nRanks));
for (int r=0; r<comm->nRanks; r++) comm->p2plist.peerlist[r].sendbytes = comm->p2plist.peerlist[r].recvbytes = -1;
NCCLCHECK(ncclCalloc(&comm->p2plist.connect.recv, MAXCHANNELS*comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->p2plist.connect.send, MAXCHANNELS*comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS));
comm->asyncOpCount = 0;
comm->asyncTotalSize = 0;
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
comm->p2pSendCount = comm->p2pRecvCount = 0;
NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));
// Mark channels as non initialized.
for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
comm->alltoallDisable = false;
if (rcclParamAllToAllDisable()) comm->alltoallDisable = true;
comm->alltoallDisable = true;
//if (rcclParamAllToAllDisable() == 0) comm->alltoallDisable = false;
*comret = comm;
return ncclSuccess;
@@ -373,11 +403,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
static ncclResult_t devCommSetup(ncclComm_t comm) {
// Duplicate the channels on the device
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, std::max(comm->nChannels, comm->p2pnChannels)));
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, std::max(comm->nChannels, comm->p2pnChannels)));
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
// Copy userRanks and peers
for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
for (int r=0; r<comm->p2pnChannels; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
}
@@ -449,7 +479,7 @@ void* waitForNonNullPtr(void* p) {
ncclResult_t initParams(struct ncclComm* comm) {
hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
params->args =(void **)&comm->argsptr;
params->args = (void **)&comm->argsptr;
params->stream = NULL;
params->sharedMem = 0;
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
@@ -518,8 +548,8 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
#define DEFAULT_BUFFSIZE_ARM (1LL << 20) /* 1MiB */
#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */
NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
@@ -532,10 +562,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
if (comm->nRanks >= 32) {
defaults[NCCL_PROTO_SIMPLE] = 524288;
INFO(NCCL_INIT, "Setting DEFAULT_BUFFSIZE to %d for nRanks %d", defaults[NCCL_PROTO_SIMPLE], comm->nRanks);
}
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
comm->buffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
@@ -581,7 +607,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
// setup
struct ncclConnect myConnect;
if (isMaster && ret > 0) {
NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
}
// prepare connect handles
ncclResult_t res;
@@ -611,7 +637,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
}
// connect
if (isMaster && ret > 0) {
NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
struct ncclPeer* devRoot = channel->devPeers+nranks;
struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
@@ -669,10 +695,9 @@ NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
// We use 3 AllGathers
// 1. { peerInfo, comm }
// 2. ConnectTransport[nranks], ConnectValue[nranks]
// 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
// We use 2 AllGathers
// 1. { peerInfo, comm, compCap}
// 2. { nChannels, graphInfo, topoRanks }
int rank = comm->rank;
int nranks = comm->nRanks;
@@ -684,10 +709,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
struct {
struct ncclPeerInfo peerInfo;
struct ncclComm* comm;
int cudaCompCap;
} *allGather1Data;
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
allGather1Data[rank].comm = comm;
allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
NCCLCHECK(fillInfo(comm, myInfo, commHash));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
@@ -700,7 +727,40 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
return ncclInvalidUsage;
}
}
// AllGather1 data is used again below
// Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
int myCompCap = allGather1Data[rank].cudaCompCap;
int minCompCap = myCompCap, maxCompCap = myCompCap;
uint64_t otherHostHash;
int tmpNnodes = 1;
for (int i = 0; i < nranks; i++) {
if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
if (intraRanks == 0) intraRank0 = i;
if (i == rank) intraRank = intraRanks;
intraRanks++;
}
} else { // Determine whether number of nodes is 2 (for use in tree pattern determination)
if (tmpNnodes == 1) {
otherHostHash = allGather1Data[i].peerInfo.hostHash;
tmpNnodes = 2;
} else if (tmpNnodes == 2 && otherHostHash != allGather1Data[i].peerInfo.hostHash) {
tmpNnodes = 3;
}
}
minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
}
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
struct ncclComm* intraRank0Comm = allGather1Data[intraRank0].comm;
// AllGather1 - end
// Topo detection / System graph creation
@@ -729,7 +789,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
struct ncclTopoGraph treeGraph;
treeGraph.id = 1;
treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
treeGraph.pattern = tmpNnodes <= 2 ? NCCL_TOPO_PATTERN_TREE : NCCL_TOPO_PATTERN_BALANCED_TREE;
treeGraph.crossNic = ncclParamCrossNic();
treeGraph.collNet = 0;
treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
@@ -753,10 +813,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// AllGather3 - begin
// Compact per-graph summary that a rank publishes in the AllGather3 exchange.
// NOTE(review): presumably the type of the tree/ring/collNet members of
// allGather3Data, whose declaration is not fully visible here -- confirm.
// Each field mirrors the same-named field of a local ncclTopoGraph.
struct ncclGraphInfo {
int pattern;       // copy of the graph's pattern value
int sameChannels;  // copy of the graph's sameChannels value
float speedIntra;  // copy of the graph's speedIntra value
float speedInter;  // copy of the graph's speedInter value
int typeIntra;     // copy of the graph's typeIntra value
int typeInter;     // copy of the graph's typeInter value
};
struct {
@@ -776,29 +838,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
std::min(treeGraph.nChannels, ringGraph.nChannels);
allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
allGather3Data[rank].ring.pattern = ringGraph.pattern;
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
// Determine nNodes, firstRanks, ...
int* nodesFirstRank;
int *nodesFirstRank, *nodesTreePatterns;
NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
for (int i=0; i<nranks; i++) {
int node = -1;
int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
@@ -808,18 +878,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
if (node == -1) {
node = comm->nNodes++;
nodesFirstRank[node] = firstRank;
// Record tree pattern of each node as they can be different depending on sm arch
nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
}
if (i == comm->rank) comm->node = node;
}
// Determine the minimum CUDA Compute capability of all GPUs
int myCompCap = allGather3Data[rank].cudaCompCap;
int minCompCap = myCompCap, maxCompCap = myCompCap;
for (int i = 0; i < nranks; i++) {
minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
}
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
@@ -835,15 +899,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
}
if (comm->alltoallDisable != alltoallDisable) {
comm->alltoallDisable = alltoallDisable;
}
@@ -873,7 +941,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, gcn, nNets));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {
@@ -881,23 +949,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
}
free(allTopoRanks);
free(nodesTreePatterns);
free(nodesFirstRank);
free(allGather1Data);
free(allGather3Data);
// AllGather3 - end
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
char line[1024];
line[0]='\0';
for (int c=0; c<comm->nChannels; c++) {
struct ncclTree* treeUp = &comm->channels[c].treeUp;
struct ncclTree* treeDn = &comm->channels[c].treeDn;
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
struct ncclTree* tree = &comm->channels[c].tree;
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
}
line[1023] = '\0';
@@ -913,16 +979,24 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(computeBuffSizes(comm));
// Connect with prev/next for each ring
struct ncclConnect *connect;
NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all rings");
// Connect Trees
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all trees");
// Check if we can setup CollNet
if (comm->nNodes > 1 &&
@@ -935,8 +1009,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
for (int c=0; c<logicChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
struct ncclChannel* channelSend = comm->channels+c;
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down));
NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up));
const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
@@ -944,82 +1018,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
collNetSetupFail = 1;
}
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph));
// Verify CollNet setup across ranks
NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
}
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
free(connect);
free(rings);
// Compute time models for algorithm and protocol combinations
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
// Compute nChannels per peer for p2p
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
if (!alltoallDisable) {
int nc = comm->nChannels;
if (comm->topo->type == RCCL_TOPO_4P2H_ROME)
nc = 2;
for (int c=0; c<nc; c++) {
const int peersPerChan = DIVUP(nranks, nc);
struct ncclP2PConnect* connect = &comm->p2plist.connect;
connect->nrecv[c] = 0;
connect->nsend[c] = 0;
for (int p=0; p<peersPerChan; p++) {
// first channel is reserved for self copy
if ((c*peersPerChan+p)%nranks == 0)
continue;
int peerSend = (rank+c*peersPerChan+p)%nranks;
int peerRecv = (2*nranks+rank-(c*peersPerChan)%nranks-p)%nranks;
if (comm->channels[c].peers[peerSend].send.connected == 0) {
connect->send[c*nranks+connect->nsend[c]++] = peerSend;
}
if (comm->channels[c].peers[peerRecv].recv.connected == 0) {
connect->recv[c*nranks+connect->nrecv[c]++] = peerRecv;
}
}
}
for (int c=0; c<nc; c++) {
struct ncclChannel* channel = comm->channels+c;
struct ncclP2PConnect* connect = &comm->p2plist.connect;
#if 0
printf("channel %d recv: ", c);
for (int i=0; i<connect->nrecv[c]; i++)
printf("%d ", connect->recv[c*nranks+i]);
printf("\n");
printf("channel %d send: ", c);
for (int i=0; i<connect->nsend[c]; i++)
printf("%d ", connect->send[c*nranks+i]);
printf("\n");
#endif
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*nranks, connect->nsend[c], connect->send+c*nranks));
connect->nrecv[c] = 0;
connect->nsend[c] = 0;
}
}
// Compute intra ranks (using AllGather1 data)
do {
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
for (int i = 0; i < nranks; i++) {
if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
(allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
if (intraRanks == 0) intraRank0 = i;
if (i == rank) intraRank = intraRanks;
intraRanks++;
}
}
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
} while(0);
// Done with AllGather1 data
free(allGather1Data);
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
@@ -1083,6 +1095,7 @@ end:
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
int cudaDev;
CUDACHECK(hipGetDevice(&cudaDev));
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev));
@@ -1091,6 +1104,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
if (ndev < 0) {
WARN("Invalid device count requested : %d", ndev);
@@ -1110,9 +1124,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
static ncclResult_t commDestroy(ncclComm_t comm) {
int savedDevice;
#ifdef ENABLE_TRACE
int rank = comm->rank;
#endif
CUDACHECK(hipGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
@@ -1120,7 +1131,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
CUDACHECK(hipSetDevice(commDevice));
}
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError);
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, LOAD(comm->abortFlag), comm->fatalError);
CUDACHECK(hipStreamSynchronize(comm->groupStream));
NCCLCHECK(ncclProxyDestroy(comm));
@@ -1129,13 +1140,14 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
if (savedDevice != commDevice)
CUDACHECK(hipSetDevice(savedDevice));
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (comm == NULL)
return ncclSuccess;
@@ -1152,6 +1164,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
ncclResult_t ncclCommAbort(ncclComm_t comm) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (comm == NULL)
return ncclSuccess;
@@ -1186,6 +1199,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
*count = comm->nRanks;
@@ -1194,6 +1208,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
*devid = comm->cudaDev;
@@ -1202,6 +1217,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
*rank = comm->rank;
+6 -13
Wyświetl plik
@@ -46,26 +46,19 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
}
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
info->nBytes = info->count * ncclTypeSize(info->datatype);
if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast
|| info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAll) {
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
info->count = info->nBytes;
info->datatype = ncclInt8;
}
if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter
|| info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAll)
info->nBytes *= info->comm->nRanks; // count is per rank
if (info->coll == ncclCollAllToAllv) {
// Use count to store data type size for alltoallv
info->count = ncclTypeSize(info->datatype);
info->datatype = ncclInt8;
}
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
if (info->op < 0 || info->op >= ncclNumOps) {
WARN("%s : invalid reduction operation %d", info->opName, info->op);
return ncclInvalidArgument;
}
if (info->comm->checkPointers) {
if (info->coll == ncclCollSendRecv) {
if (info->coll == ncclFuncSendRecv) {
if (strcmp(info->opName, "Send") == 0) {
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
} else {
@@ -73,10 +66,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
}
} else {
// Check CUDA device pointers
if ((info->coll != ncclCollBroadcast && info->coll != ncclCollScatter) || info->comm->rank == info->root) {
if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
}
if ((info->coll != ncclCollReduce && info->coll != ncclCollGather) || info->comm->rank == info->root) {
if (info->coll != ncclFuncReduce || info->comm->rank == info->root) {
NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
}
}
+1 -55
Wyświetl plik
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,14 +16,11 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
static nvmlReturn_t (*nvmlInternalShutdown)(void);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
// Used to make the NVML library calls thread safe
@@ -74,10 +71,7 @@ ncclResult_t wrapNvmlSymbols(void) {
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -91,9 +85,6 @@ teardown:
nvmlInternalShutdown = NULL;
nvmlInternalDeviceGetHandleByPciBusId = NULL;
nvmlInternalDeviceGetIndex = NULL;
nvmlInternalDeviceGetHandleByIndex = NULL;
nvmlInternalDeviceGetPciInfo = NULL;
nvmlInternalDeviceGetMinorNumber = NULL;
nvmlInternalDeviceGetNvLinkState = NULL;
nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -162,51 +153,6 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
return ncclSuccess;
}
// Calls nvmlDeviceGetHandleByIndex through its resolved function pointer.
//   index  : device index handed to NVML unchanged
//   device : output handle, written by NVML
// Returns ncclInternalError if the function pointer is still NULL,
// ncclSystemError if NVML returns anything other than NVML_SUCCESS,
// and ncclSuccess otherwise.
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
  // Never call through an unset pointer.
  if (!nvmlInternalDeviceGetHandleByIndex) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }

  nvmlReturn_t ret;
  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
  if (ret == NVML_SUCCESS) return ncclSuccess;

  // Failure path: log NVML's own description of the error code.
  WARN("nvmlDeviceGetHandleByIndex() failed: %s ", nvmlInternalErrorString(ret));
  return ncclSystemError;
}
// Calls nvmlDeviceGetPciInfo through its resolved function pointer.
//   device : NVML device handle to query
//   pci    : output structure, written by NVML
// Returns ncclInternalError if the function pointer is still NULL,
// ncclSystemError if NVML returns anything other than NVML_SUCCESS,
// and ncclSuccess otherwise.
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
  // Never call through an unset pointer.
  if (!nvmlInternalDeviceGetPciInfo) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }

  nvmlReturn_t ret;
  NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
  if (ret == NVML_SUCCESS) return ncclSuccess;

  // Failure path: log NVML's own description of the error code.
  WARN("nvmlDeviceGetPciInfo() failed: %s ", nvmlInternalErrorString(ret));
  return ncclSystemError;
}
// Calls nvmlDeviceGetMinorNumber through its resolved function pointer.
//   device      : NVML device handle to query
//   minorNumber : output value, written by NVML
// Returns ncclInternalError if the function pointer is still NULL,
// ncclSystemError if NVML returns anything other than NVML_SUCCESS,
// and ncclSuccess otherwise.
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
  // Never call through an unset pointer.
  if (!nvmlInternalDeviceGetMinorNumber) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }

  nvmlReturn_t ret;
  NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
  if (ret == NVML_SUCCESS) return ncclSuccess;

  // Failure path: log NVML's own description of the error code.
  WARN("nvmlDeviceGetMinorNumber() failed: %s ", nvmlInternalErrorString(ret));
  return ncclSystemError;
}
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
if (nvmlInternalDeviceGetNvLinkState == NULL) {
/* Do not warn, this symbol is optional. */
+376 -162
Wyświetl plik
@@ -6,10 +6,10 @@
#include "comm.h"
#include "info.h"
#include "graph.h"
#include "collectives.h"
#define RECV 0
#define SEND 1
enum { proxyRecv=0, proxySend=1 };
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -19,15 +19,13 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
int index = pattern == ncclPatternPipelineFrom ?
/* no recv / no send if root = */
/* bcast */ (type == RECV ? myrank : nextrank ):
/* reduce */ (type == RECV ? prevrank : myrank );
/* bcast */ (type == proxyRecv ? myrank : nextrank ):
/* reduce */ (type == proxyRecv ? prevrank : myrank );
int rank = ring->userRanks[index];
return (root != rank);
}
enum { proxyRecv=0, proxySend=1 };
#define PROXYARGS_ALLOCATE_SIZE 32
#define PROXYARGS_ALLOCATE_SIZE 128
struct ncclProxyPool {
struct ncclProxyPool *next;
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
@@ -36,7 +34,7 @@ struct ncclProxyPool {
static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
struct ncclProxyState* state = &comm->proxyState;
struct ncclProxyArgs* elem;
pthread_mutex_lock(&state->mutex);
pthread_mutex_lock(&state->poolMutex);
if (state->pool == NULL) {
// Allocate a new pool of elements
struct ncclProxyPool* newPool;
@@ -54,39 +52,113 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a
}
elem = state->pool;
state->pool = state->pool->next;
pthread_mutex_unlock(&state->mutex);
elem->next = elem->nextPeer = NULL;
pthread_mutex_unlock(&state->poolMutex);
elem->next = elem->nextPeer = elem->nextGroup = NULL;
*argsptr = elem;
return ncclSuccess;
}
static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
struct ncclComm* comm = connector->comm;
struct ncclProxyState* state = &comm->proxyState;
pthread_mutex_lock(&state->mutex);
if (connector->proxyAppend == NULL) {
// Nothing running for that peer. Add to the circular list
if (state->ops == NULL) {
// Create the list
args->next = args;
state->ops = args;
} else {
// Insert element in the list
args->next = state->ops->next;
state->ops->next = args;
//#define DEBUG_PROXY 1
#ifdef DEBUG_PROXY
#define DEBUG_PROXY_PRINT printf
#else
#define DEBUG_PROXY_PRINT(...)
#endif
#define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1)
#define OP_SEEN 0x100000
ncclResult_t dumpProxyState(struct ncclProxyState* state) {
#ifdef DEBUG_PROXY
struct ncclProxyArgs* op = state->ops;
while (op) {
if (op->idle & OP_SEEN) {
WARN("Active list loop at element %ld\n", OP_INDEX(op));
}
op->idle |= OP_SEEN;
printf("[%ld]", OP_INDEX(op));
if (op->nextPeer) {
printf("(%ld)", OP_INDEX(op->nextPeer));
struct ncclProxyArgs* n = op->nextPeer;
n->idle |= OP_SEEN;
while (n->nextGroup || n->nextPeer) {
n = n->nextGroup ? n->nextGroup : n->nextPeer;
n->idle |= OP_SEEN;
}
}
if (op->nextGroup) {
printf("--G->");
op = op->nextGroup;
} else {
printf("--N->");
op = op->next;
}
connector->proxyAppend = args;
} else {
// There is an active operation already for that peer.
// Add it to the per-peer list
connector->proxyAppend->nextPeer = args;
connector->proxyAppend = args;
}
pthread_mutex_unlock(&state->mutex);
printf("[X]\n");
struct ncclProxyArgs* free = state->pool;
while (free) {
if (free->idle & OP_SEEN) {
WARN("Free list loop at element %ld\n", OP_INDEX(free));
}
free->idle |= OP_SEEN;
free = free->next;
}
struct ncclProxyPool* p = state->pools;
int i = 0;
while (p) {
for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
if ((p->elems[e].idle & OP_SEEN) == 0) {
WARN("Element %d of pool %d has been lost\n", e, i);
struct ncclProxyArgs* free = state->pool;
printf("Free list ");
while (free) {
printf("--> %ld ", OP_INDEX(free));
free = free->next;
}
printf("\n");
return ncclInternalError;
}
p->elems[e].idle -= OP_SEEN;
}
p = p->next;
i++;
}
#endif
return ncclSuccess;
}
template <int type>
static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
/* Link `args` into the active proxy list of `state`.
 *
 * *args->proxyAppendPtr holds the op most recently appended through the same
 * append slot (NULL when there is none). Three cases:
 *  - no previous op: `args` becomes state->ops, or is linked after the last
 *    element reachable by following nextGroup/next from state->ops;
 *  - previous op exists, `shared` is non-zero and both ops carry the same
 *    opCount: `args` is chained through nextGroup and inherits the previous
 *    op's `next` link;
 *  - otherwise: `args` is chained through nextPeer of the previous op.
 * In every case the append slot is updated to point at `args`.
 * Always returns ncclSuccess. */
static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args, int shared) {
  struct ncclProxyArgs** slot = args->proxyAppendPtr;
  struct ncclProxyArgs* tail = *slot;

  if (tail == NULL) {
    // Nothing running for that peer. Add to the list
    if (state->ops == NULL) {
      // Create the list
      DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element : \n", OP_INDEX(args), shared, args->opCount);
      state->ops = args;
    } else {
      // Append element at the end of the list, preferring nextGroup over next at each hop
      struct ncclProxyArgs* cursor = state->ops;
      for (;;) {
        struct ncclProxyArgs* following = cursor->nextGroup ? cursor->nextGroup : cursor->next;
        if (following == NULL) break;
        cursor = following;
      }
      cursor->next = args;
      DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element : \n", OP_INDEX(args),shared, args->opCount);
    }
  } else if (shared && tail->opCount == args->opCount) {
    // Same opCount on a shared slot: extend the group and take over the `next` link
    args->next = tail->next;
    tail->next = NULL;
    tail->nextGroup = args;
    DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group, prevGroup %5ld, next %5ld : \n", OP_INDEX(args), shared, tail->opCount, args->opCount, OP_INDEX(tail), OP_INDEX(args->next));
  } else {
    // Queue behind the previous op on the per-peer chain
    tail->nextPeer = args;
    DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld : \n", OP_INDEX(args), shared, tail->opCount, args->opCount, OP_INDEX(tail));
  }

  *slot = args;
  return ncclSuccess;
}
static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
if (peer < 0) return ncclSuccess;
struct ncclPeer* peerComm = args->channel->peers+peer;
@@ -98,107 +170,168 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
}
if (connector->transportComm->proxy == NULL) return ncclSuccess;
struct ncclProxyState* state = &connector->comm->proxyState;
struct ncclProxyArgs* op;
NCCLCHECK(allocateArgs(connector->comm, &op));
memcpy(op, args, sizeof(struct ncclProxyArgs));
op->connector = connector;
op->progress = connector->transportComm->proxy;
op->state = ncclProxyOpReady;
ProxyAppend(connector, op);
op->proxyAppendPtr =
connector->conn.shared ?
state->sharedBuffs->proxyAppend+2*args->channel->id+type : // Shared buffers
&connector->proxyAppend; // Dedicated buffers
if (state->nextOps == NULL) state->nextOps = op;
else state->nextOpsEnd->next = op;
state->nextOpsEnd = op;
return ncclSuccess;
}
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
struct ncclRing* ring = &args->channel->ring;
if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
if (NeedProxy(proxyRecv, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args));
if (NeedProxy(proxySend, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args));
}
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
// Tree up
struct ncclTree* tree = &args->channel->treeUp;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
struct ncclTree* tree = &args->channel->tree;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args));
NCCLCHECK(SaveProxy(proxySend, tree->up, args));
}
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
// Tree down
struct ncclTree* tree = &args->channel->treeDn;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
struct ncclTree* tree = &args->channel->tree;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args));
NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
}
if (pattern == ncclPatternCollTreeUp) {
// CollTree up
struct ncclTree* tree = &args->channel->collTreeUp;
NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
struct ncclTree* tree = &args->channel->collTree;
NCCLCHECK(SaveProxy(proxyRecv, tree->down[0], args));
NCCLCHECK(SaveProxy(proxySend, tree->up, args));
}
if (pattern == ncclPatternCollTreeDown) {
// CollTree down
struct ncclTree* tree = &args->channel->collTreeDn;
NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
struct ncclTree* tree = &args->channel->collTree;
NCCLCHECK(SaveProxy(proxySend, tree->down[0], args));
NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
}
return ncclSuccess;
}
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel) {
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment) {
struct ncclProxyArgs args;
memset(&args, 0, sizeof(struct ncclProxyArgs));
args.channel = channel;
args.sliceSteps = 1;
args.chunkSteps = 1;
args.protocol = NCCL_PROTO_SIMPLE;
args.opCount = info->comm->opCount;
args.segment = segment;
args.opCount = channel->workFifoTail-1;
args.dtype = info->datatype;
if (info->delta > 0 && info->sendbytes >= 0) {
int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
if (args.nsteps == 0) args.nsteps = 1;
NCCLCHECK(SaveProxy<proxySend>(peersend, &args));
}
if (info->delta > 0 && info->recvbytes >= 0) {
int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
if (args.nsteps == 0) args.nsteps = 1;
NCCLCHECK(SaveProxy<proxyRecv>(peerrecv, &args));
args.recvbytes = info->recvbytes;
args.sendbytes = 0;
NCCLCHECK(SaveProxy(proxyRecv, peerrecv, &args));
}
if (info->delta > 0 && info->sendbytes >= 0) {
int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
if (args.nsteps == 0) args.nsteps = 1;
args.sendbytes = info->sendbytes;
args.recvbytes = 0;
NCCLCHECK(SaveProxy(proxySend, peersend, &args));
}
return ncclSuccess;
}
ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info) {
const int peersPerChan = DIVUP(info->comm->nRanks, info->nChannels);
const int chunkSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS*info->chunkSteps;
const int loopSize = (info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1)*info->nchunksPerLoop*chunkSize;
for (int p=0; p<peersPerChan; p++) {
if ((peersPerChan == 1 && args->channel->id >= (info->nChannels/info->comm->nRanks)*info->comm->nRanks) ||
(peersPerChan > 1 && args->channel->id*peersPerChan+p >= info->comm->nRanks))
continue;
// first channel is reserved for self copy
if ((args->channel->id*peersPerChan+p)%info->comm->nRanks == 0)
continue;
int peerSend = (info->comm->rank+(args->channel->id*peersPerChan)+p)%info->comm->nRanks;
int peerRecv = (2*info->comm->nRanks+info->comm->rank-(args->channel->id*peersPerChan)%info->comm->nRanks-p%info->comm->nRanks)%info->comm->nRanks;
if (info->coll == ncclCollAllToAll || (info->coll == ncclCollScatter && info->comm->rank == info->root) ||
(info->coll == ncclCollGather && peerSend == info->root))
NCCLCHECK(SaveProxy<proxySend>(peerSend, args));
if (info->coll == ncclCollAllToAll || (info->coll == ncclCollGather && info->comm->rank == info->root) ||
(info->coll == ncclCollScatter && peerRecv == info->root))
NCCLCHECK(SaveProxy<proxyRecv>(peerRecv, args));
if (info->coll == ncclCollAllToAllv) {
info->nBytes = info->sendcounts[peerSend]*info->count;
int nLoops = (int)(DIVUP(info->nBytes, loopSize));
args->nsteps = info->nstepsPerLoop*nLoops*info->chunkSteps;
TRACE(NCCL_NET,"peerSend %d opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
peerSend, args->opCount, args->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, args->nsteps, info->comm);
NCCLCHECK(SaveProxy<proxySend>(peerSend, args));
info->nBytes = info->recvcounts[peerRecv]*info->count;
nLoops = (int)(DIVUP(info->nBytes, loopSize));
args->nsteps = info->nstepsPerLoop*nLoops*info->chunkSteps;
TRACE(NCCL_NET,"peerRecv %d opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
peerRecv, args->opCount, args->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, args->nsteps, info->comm);
NCCLCHECK(SaveProxy<proxyRecv>(peerRecv, args));
// Unlink the op at *opPtr from the active list rooted at state->ops and push it
// onto the free list state->pool.
//
// state        : proxy state owning the active list (ops) and free list (pool).
// opPtr        : in: op to remove; out: op the caller should visit next
//                (freeOp->nextGroup when set, otherwise freeOp->next).
// prevOpPtr    : predecessor reached through `next` (NULL if none); may be updated.
// prevGroupPtr : predecessor reached through `nextGroup` (NULL if none); may be updated.
//
// Returns ncclInternalError when both predecessors are non-NULL, the result of
// dumpProxyState() if that fails, and ncclSuccess otherwise.
static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr, struct ncclProxyArgs** prevGroupPtr) {
struct ncclProxyArgs* freeOp = *opPtr;
DEBUG_PROXY_PRINT("Remove %ld/%ld -> %ld -> %ld/%ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(*prevGroupPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next), OP_INDEX(freeOp->nextGroup));
// At most one kind of predecessor may be set at a time.
if (*prevGroupPtr && *prevOpPtr) return ncclInternalError;
if (freeOp->nextGroup) {
// Part of a group : remove the element
struct ncclProxyArgs* next = freeOp->nextGroup;
*opPtr = next;
// Whoever pointed at freeOp (group predecessor, list predecessor, or the
// list head) now points at the following group member.
if (*prevGroupPtr) {
(*prevGroupPtr)->nextGroup = next;
} else if (*prevOpPtr) {
(*prevOpPtr)->next = next;
} else {
state->ops = next;
}
} else {
// freeOp has no nextGroup successor.
struct ncclProxyArgs* next = freeOp->next;
*opPtr = next;
if ((*prevGroupPtr)) {
// freeOp was reached through nextGroup: its group predecessor takes over
// freeOp's next and nextPeer links and its nextGroup link is cleared.
(*prevGroupPtr)->next = next;
(*prevGroupPtr)->nextGroup = NULL;
(*prevGroupPtr)->nextPeer = freeOp->nextPeer;
// Keep the append slot valid if it was still pointing at the op being freed.
if (*(freeOp->proxyAppendPtr) == freeOp) *(freeOp->proxyAppendPtr) = *prevGroupPtr;
// From the caller's point of view the group predecessor is now a plain
// `next` predecessor of *opPtr.
(*prevOpPtr) = *prevGroupPtr;
(*prevGroupPtr) = NULL;
} else {
if (freeOp->nextPeer) {
// replace op by nextPeer
struct ncclProxyArgs* nextPeer = freeOp->nextPeer;
if (*prevOpPtr) {
(*prevOpPtr)->next = nextPeer;
} else {
state->ops = nextPeer;
}
// Link the end of the promoted op's nextGroup chain to the old successor
// and report that element as the new predecessor of *opPtr.
struct ncclProxyArgs* lastGroup = nextPeer;
while (lastGroup->nextGroup) lastGroup = lastGroup->nextGroup;
lastGroup->next = next;
*(prevOpPtr) = lastGroup;
} else {
// Nothing queued behind freeOp: clear the append slot and bypass freeOp.
*(freeOp->proxyAppendPtr) = NULL;
if (*prevOpPtr) {
(*prevOpPtr)->next = next;
} else {
state->ops = next;
}
}
}
}
// Return the element to the free list; the pool is guarded by poolMutex.
pthread_mutex_lock(&state->poolMutex);
freeOp->next = state->pool;
state->pool = freeOp;
pthread_mutex_unlock(&state->poolMutex);
DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
// Consistency check of the lists; its body is compiled in only when DEBUG_PROXY is defined.
NCCLCHECK(dumpProxyState(state));
return ncclSuccess;
}
static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
struct ncclProxyArgs* prevOp = NULL;
struct ncclProxyArgs* prevGroup = NULL;
struct ncclProxyArgs* op = *opsPtr;
while (op) {
if (op->state == ncclProxyOpNone) return ncclInternalError;
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
// yet and might be cancelled before they even start. Hold on on those.
if (op->opCount < comm->lastOpCount) {
NCCLCHECK(op->progress(op));
*idle &= op->idle;
}
if (op->state == ncclProxyOpNone) {
NCCLCHECK(removeOp(state, &op, &prevOp, &prevGroup));
} else {
if (op->nextGroup) {
prevGroup = op;
prevOp = NULL;
op = op->nextGroup;
} else {
prevOp = op;
prevGroup = NULL;
op = op->next;
}
}
}
return ncclSuccess;
@@ -207,91 +340,170 @@ ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info)
void* persistentThread(void *comm_) {
struct ncclComm* comm = (struct ncclComm*)comm_;
struct ncclProxyState* state = &comm->proxyState;
struct ncclProxyArgs* op = NULL;
ncclResult_t ret = ncclSuccess;
int idle = 1;
int idleSpin = 0;
char threadName[16];
sprintf(threadName, "NCCLproxy %5d", comm->rank);
nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
pthread_mutex_lock(&state->opsMutex);
struct ncclProxyArgs** opsPtr = &state->ops;
while (1) {
do {
if (*comm->abortFlag) return NULL;
if (op == NULL) {
pthread_mutex_lock(&state->mutex);
op = state->ops;
if (op == NULL) {
if (state->stop) {
// No more commands to process and proxy has been requested to stop
pthread_mutex_unlock(&state->mutex);
return NULL;
}
pthread_cond_wait(&state->cond, &state->mutex);
}
pthread_mutex_unlock(&state->mutex);
if (LOAD(comm->abortFlag)) {
pthread_mutex_unlock(&state->opsMutex);
return NULL;
}
while (LOAD(opsPtr) == NULL) {
if (state->stop) {
// No more commands to process and proxy has been requested to stop
pthread_mutex_unlock(&state->opsMutex);
return NULL;
}
} while (op == NULL);
op->idle = 0;
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
// yet and might be cancelled before they even start. Hold on on those.
if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
pthread_cond_wait(&state->cond, &state->opsMutex);
}
int idle = 1;
ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
if (ret != ncclSuccess) {
comm->fatalError = ret;
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
pthread_mutex_unlock(&state->opsMutex);
return NULL;
}
idle &= op->idle;
pthread_mutex_lock(&state->mutex);
if (!idle) idleSpin = 0;
struct ncclProxyArgs *next = op->next;
if (next->state == ncclProxyOpNone) {
struct ncclProxyArgs *freeOp = next;
if (next->nextPeer) {
// Replace next by its next per-peer element.
next = next->nextPeer;
if (op != freeOp) {
next->next = freeOp->next;
op->next = next;
} else {
next->next = next;
}
} else {
// Remove next from circular list
next->connector->proxyAppend = NULL;
if (op != freeOp) {
next = next->next;
op->next = next;
} else {
next = NULL;
}
}
if (freeOp == state->ops) state->ops = next;
freeOp->next = state->pool;
state->pool = freeOp;
if (idle) {
pthread_mutex_unlock(&state->opsMutex);
sched_yield(); // No request progressed. Let others run.
pthread_mutex_lock(&state->opsMutex);
}
op = next;
if (op == state->ops) {
if (idle == 1) {
if (++idleSpin == 10) {
sched_yield();
idleSpin = 0;
}
}
idle = 1;
}
pthread_mutex_unlock(&state->mutex);
}
}
ncclResult_t ncclProxyStart(struct ncclComm* comm) {
pthread_mutex_lock(&comm->proxyState.mutex);
if (comm->proxyState.ops != NULL)
pthread_cond_signal(&comm->proxyState.cond);
pthread_mutex_unlock(&comm->proxyState.mutex);
struct ncclProxyState* state = &comm->proxyState;
pthread_mutex_lock(&state->opsMutex);
// Sort operations as we append them : collectives and
// receives first, then sends.
ncclProxyArgs* next, *prev = NULL, *op = state->nextOps;
while (op) {
next = op->next;
if (op->sendbytes) {
if (prev) prev->next = next;
else state->nextOps = next;
op->next = NULL;
NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
} else prev = op;
op = next;
}
op = state->nextOps;
while (op) {
next = op->next;
op->next = NULL;
NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
op = next;
}
state->nextOps = state->nextOpsEnd = NULL;
NCCLCHECK(dumpProxyState(state));
if (state->ops != NULL)
pthread_cond_signal(&state->cond);
pthread_mutex_unlock(&state->opsMutex);
return ncclSuccess;
}
NCCL_PARAM(ProxySharedBuffersCount, "SHARED_BUFF_COUNT", -2);
/* Create (on first use) the shared proxy buffer pools of `comm` and return the
 * base pointer and total size of the requested flavor.
 *
 * comm : communicator whose proxyState.sharedBuffs is initialised on first call.
 * cuda : non-zero selects the buffer obtained from ncclCudaCalloc, zero the one
 *        obtained from ncclCudaHostCalloc.
 * size : out, total byte size: 2 * p2pnChannels * slotSize * nslots.
 * ptr  : out, base address of the selected buffer (pool 0).
 *
 * The buffer is carved into 2*p2pnChannels pools of nslots slots each; a
 * matching int array records which slots are in use.
 *
 * Only the flavor selected by `cuda` is allocated. The previous
 * `if (cuda && ...) else if (host missing)` chain fell through to the host
 * branch whenever the device pool already existed, allocating a host pool that
 * had not been requested. */
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
  if (state == NULL) {
    NCCLCHECK(ncclCalloc(&state, 1));
    comm->proxyState.sharedBuffs = state;
    state->nslots = ncclParamProxySharedBuffersCount();
    // -2 is the parameter's default value: fall back to the built-in slot count.
    if (state->nslots == -2) {
      state->nslots = NCCL_STEPS*NCCL_MAX_WORK_ELEMENTS;
    }
    state->slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
  }

  char* buff;
  int* used;
  *size = 2*comm->p2pnChannels*state->slotSize*state->nslots;

  if (cuda) {
    if (state->cudaBuff[0] == NULL) {
      NCCLCHECK(ncclCudaCalloc(&buff, *size, cuda));
      NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
      // Every pool is a fixed-size window into the single allocation.
      for (int i=0; i<2*comm->p2pnChannels; i++) {
        state->cudaBuff[i] = buff + state->nslots*state->slotSize*i;
        state->cudaUsed[i] = used + state->nslots*i;
      }
    }
  } else if (state->hostBuff[0] == NULL) {
    NCCLCHECK(ncclCudaHostCalloc(&buff, *size));
    NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
    for (int i=0; i<2*comm->p2pnChannels; i++) {
      state->hostBuff[i] = buff + state->nslots*state->slotSize*i;
      state->hostUsed[i] = used + state->nslots*i;
    }
  }

  buff = cuda ? state->cudaBuff[0] : state->hostBuff[0];
  *ptr = buff;
  return ncclSuccess;
}
/* Reserve a run of slots in one shared proxy buffer pool.
 *
 * comm    : communicator owning proxyState.sharedBuffs.
 * cuda    : non-zero selects the cuda pools, zero the host pools.
 * type    : combined with `channel` as pool index 2*channel+type.
 * channel : channel id.
 * size    : requested size in bytes; rounded up to a power-of-two number of slots.
 * ptr     : out, start of the reserved run, or NULL when no run is free.
 *
 * Returns ncclInternalError when the selected pool has no buffer, ncclSuccess
 * otherwise (including the "nothing free" case, signalled by *ptr == NULL).
 *
 * Only runs that lie entirely inside the pool are considered. The previous
 * bound (`s < state->nslots`) let the scan read and mark `used[]` entries past
 * the end of the pool whenever the rounded-up slot count exceeded, or did not
 * evenly divide, state->nslots. */
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr) {
  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
  // Use different pools for different channels and also separate send/recv.
  int p = 2*channel+type;
  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
  if (buff == NULL) return ncclInternalError;

  // Smallest power-of-two slot count that covers `size` bytes.
  int nslots = 1;
  while (nslots*state->slotSize < size) nslots *= 2;

  // Candidate runs start at multiples of nslots and must fit inside the pool.
  for (int s=0; s+nslots<=state->nslots; s+=nslots) {
    int u = 0;
    for (int i=0; i<nslots; i++) u += used[s+i];
    if (u == 0) {
      for (int i=0; i<nslots; i++) used[s+i] = 1;
      *ptr = buff+state->slotSize*s;
      return ncclSuccess;
    }
  }
  *ptr = NULL;
  return ncclSuccess;
}
/* Release a run of slots previously reserved in a shared proxy buffer pool.
 *
 * comm, cuda, type, channel : select the pool, as pool index 2*channel+type in
 *                             the cuda or host arrays.
 * size : byte size of the run; rounded up to a power-of-two number of slots.
 * ptr  : start of the run inside the pool.
 *
 * Returns ncclInternalError when the pool has no buffer or when the run does
 * not lie inside the pool, ncclSuccess otherwise.
 *
 * `ptr < buff` is tested explicitly: integer division truncates toward zero,
 * so an address less than one slot below the pool base used to yield slot
 * index 0, pass the old `s < 0` check and clear slots that were not its own. */
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr) {
  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
  int p = 2*channel+type;
  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
  if (buff == NULL) return ncclInternalError;

  // Smallest power-of-two slot count that covers `size` bytes.
  int nslots = 1;
  while (nslots*state->slotSize < size) nslots *= 2;

  int s = (ptr-buff)/state->slotSize;
  if (ptr < buff || s+nslots > state->nslots) {
    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
    return ncclInternalError;
  }
  for (int i=0; i<nslots; i++) used[s+i] = 0;
  return ncclSuccess;
}
/* Release the shared proxy buffers of `comm`, if any were created.
 *
 * Only entry 0 of each array is freed: ncclProxySharedBuffersInit sets the
 * other entries to offsets into the same allocation. A failing hipFree or
 * ncclCudaHostFree returns early through CUDACHECK / NCCLCHECK. */
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
  struct ncclProxySharedBuffers* shared = comm->proxyState.sharedBuffs;
  if (shared == NULL) return ncclSuccess;

  CUDACHECK(hipFree(shared->cudaBuff[0]));
  free(shared->cudaUsed[0]);
  NCCLCHECK(ncclCudaHostFree(shared->hostBuff[0]));
  free(shared->hostUsed[0]);
  free(shared);
  return ncclSuccess;
}
ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
if (!comm->proxyThread) {
comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER;
comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER;
comm->proxyState.ops = NULL;
pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
}
@@ -302,21 +514,23 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
struct ncclProxyState* state = &comm->proxyState;
// Request the proxy to stop and then wake it
pthread_mutex_lock(&state->mutex);
pthread_mutex_lock(&state->opsMutex);
state->stop = true;
pthread_cond_signal(&state->cond);
pthread_mutex_unlock(&state->mutex);
pthread_mutex_unlock(&state->opsMutex);
if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
// Free off any memory allocated for the proxy arg pools
pthread_mutex_lock(&state->mutex);
pthread_mutex_lock(&state->poolMutex);
struct ncclProxyState* proxyState = &comm->proxyState;
while (proxyState->pools != NULL) {
struct ncclProxyPool *next = proxyState->pools->next;
free(proxyState->pools);
proxyState->pools = next;
}
pthread_mutex_unlock(&state->mutex);
pthread_mutex_unlock(&state->poolMutex);
NCCLCHECK(ncclProxySharedBuffersDestroy(comm));
return ncclSuccess;
}
+77 -42
Wyświetl plik
@@ -20,15 +20,15 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
};
template <int type>
static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
for (int t=0; t<NTRANSPORTS; t++) {
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
int ret = 0;
NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
if (ret) {
connector->transportComm = transportComm;
NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId));
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId));
return ncclSuccess;
}
}
@@ -36,51 +36,86 @@ static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopo
return ncclInternalError;
}
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
struct ncclConnect connect;
struct ncclConnector* conn;
uint32_t mask = 1 << channel->id;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) { ++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue;
comm->connectRecv[peer] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].send;
if (conn->connected) { ++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue;
comm->connectSend[peer] |= mask;
}
return ncclSuccess;
}
void dumpData(struct ncclConnect* data, int ndata) {
for (int n=0; n<ndata; n++) {
printf("[%d] ", n);
uint8_t* d = (uint8_t*)data;
for (int i=0; i<sizeof(struct ncclConnect); i++) printf("%02x", d[i]);
printf("\n");
}
}
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
struct ncclConnect data[2*MAXCHANNELS];
for (int i=1; i<comm->nRanks; i++) {
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer];
uint32_t sendMask = comm->connectSend[sendPeer];
struct ncclConnect* recvData = data;
int sendChannels = 0, recvChannels = 0;
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
}
}
struct ncclConnect* sendData = recvData+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
}
}
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
sendData = data;
recvData = data+sendChannels;
}
} else {
if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
}
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
conn->connected = 1;
CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
}
}
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
conn->connected = 1;
CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
}
}
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].send;
if (conn->connected) {++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
conn->connected = 1;
CUDACHECK(hipMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
}
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) {++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
conn->connected = 1;
CUDACHECK(hipMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
}
TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
return ncclSuccess;
}
+154 -151
Wyświetl plik
@@ -27,10 +27,8 @@ struct reqSlot {
struct collNetSendResources {
void* collNetSendComm;
struct ncclSendMem* hostSendMem;
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
uint32_t* llData;
int netDev;
int useGdr;
@@ -46,10 +44,8 @@ struct collNetSendResources {
struct collNetRecvResources {
void* netListenComm;
void* collNetRecvComm;
struct ncclSendMem* hostSendMem;
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
uint32_t* llData;
int netDev;
int useGdr;
@@ -68,16 +64,15 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
}
/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct collNetSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
resources->devHostSendMem = resources->hostSendMem;
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
int recvSize = offsetof(struct ncclRecvMem, buff);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p];
@@ -85,8 +80,7 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
if (resources->useGdr) {
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
}
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
resources->devHostRecvMem = resources->hostRecvMem;
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2));
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
@@ -95,16 +89,15 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
}
/* Setup recv connector */
ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
struct collNetRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
resources->devHostSendMem = resources->hostSendMem;
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
int recvSize = offsetof(struct ncclRecvMem, buff);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p];
@@ -112,8 +105,7 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
if (resources->useGdr) {
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
}
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
resources->devHostRecvMem = resources->hostRecvMem;
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2));
@@ -124,25 +116,25 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
return ncclSuccess;
}
ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
// Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
offset += send->comm->buffSizes[p];
}
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->devHostRecvMem->tail;
send->conn.fifo = resources->devHostRecvMem->sizesFifo;
send->conn.head = &resources->devHostSendMem->head;
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
send->conn.tail = &resources->recvMem->tail;
send->conn.sizesFifo = resources->recvMem->sizesFifo;
send->conn.head = &resources->sendMem->head;
for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
// Get info from recv side
resources->collNetRank = rank;
@@ -160,24 +152,24 @@ ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, in
return ncclSuccess;
}
ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
resources->collNetRank = rank;
// Intermediate buffering on GPU for GPU Direct RDMA
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
offset += recv->comm->buffSizes[p];
}
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount are always on host
recv->conn.tail = &resources->devHostRecvMem->tail;
recv->conn.head = &resources->devHostSendMem->head;
recv->conn.tail = &resources->recvMem->tail;
recv->conn.head = &resources->sendMem->head;
// Connect to coll comm
collNetHandle_t** handlePtrs = NULL;
@@ -214,8 +206,8 @@ cleanup:
ncclResult_t collNetSendFree(void* sendTransportResources) {
struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
if (resources->collNetSendComm) {
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL]));
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
@@ -229,12 +221,12 @@ ncclResult_t collNetSendFree(void* sendTransportResources) {
ncclResult_t collNetRecvFree(void* recvTransportResources) {
struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
if (resources->collNetRecvComm) {
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL]));
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
}
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
if (resources->useGdr)
CUDACHECK(hipFree(resources->devRecvMem));
free(resources->llData);
@@ -257,96 +249,84 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
// Round to next multiple of chunkSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->head = resources->step;
args->tail = resources->step;
args->end = args->head + args->nsteps;
args->posted = args->transmitted = args->done = resources->step;
args->end = resources->step + args->nsteps;
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* sendMhandle = resources->sendMhandles[p];
void* recvMhandle = resources->recvMhandles[p];
args->idle = 1;
struct reqSlot* reqFifo = resources->reqFifo;
if (args->head < args->end) {
int buffSlot = args->tail%NCCL_STEPS;
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS
&& reqFifo[buffSlot].recvBuff != NULL) {
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
int buffSlot = args->transmitted%NCCL_STEPS;
if (args->transmitted < args->end && args->transmitted < args->done + NCCL_STEPS
&& LOAD(&reqFifo[buffSlot].recvBuff) != NULL) {
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = LOAD(sizesFifo+buffSlot);
char* buff = localBuff+buffSlot*stepSize;
int ready = 1;
if (args->protocol == NCCL_PROTO_LL) {
int size = LOAD(sizesFifo+buffSlot);
if (size != -1) {
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
int ready = 1;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *f2 = &lines[i].flag2;
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
}
if (ready) {
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
//separate data from flag
uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *d1 = &lines[i].data1;
volatile uint32_t *d2 = &lines[i].data2;
sendBuff[2*i] = LOAD(d1);
sendBuff[2*i+1] = LOAD(d2);
}
int count = nFifoLines*2*sizeof(uint32_t) / ncclTypeSize(args->dtype);
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
}
}
}
} else if (args->tail < LOAD(recvTail)) {
// Send through network
if (LOAD(sizesFifo+buffSlot) != -1) {
int count = LOAD(sizesFifo+buffSlot)/ncclTypeSize(args->dtype);
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localBuff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
}
uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
// Pack data into another buffer
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements
buff = (char*)sendBuff;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *d1 = &lines[i].data1;
volatile uint32_t *f2 = &lines[i].flag2;
volatile uint32_t *d2 = &lines[i].data2;
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
sendBuff[2*i] = LOAD(d1);
sendBuff[2*i+1] = LOAD(d2);
}
size = nFifoLines*2*sizeof(uint32_t);
}
}
if (args->head < args->tail) {
int done, size;
int buffSlot = args->head%NCCL_STEPS;
NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size);
reqFifo[buffSlot].size = size;
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
// (reordered store after store is possible on POWER, though not on x86)
__sync_synchronize();
reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
args->head += args->sliceSteps;
STORE(&resources->hostSendMem->head, args->head);
args->idle = 0;
if (ready) {
// Data is ready, try to send.
int count = size/ncclTypeSize(args->dtype);
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to -1 before we update the head.
__sync_synchronize();
args->transmitted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
}
}
}
}
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpNone;
// Completion stage of the send proxy: steps in [done, transmitted) have an
// allreduce request in flight. Poll the oldest one and, once it has finished,
// hand the result over to the recv proxy through the shared reqFifo slot.
// Check whether the network has completed some send operations.
if (args->done < args->transmitted) {
int done, size;
// Requests are stored per buffer slot; the oldest in-flight step lives at done % NCCL_STEPS.
int buffSlot = args->done%NCCL_STEPS;
NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
// Publish the size reported by collNetTest before releasing the slot.
STORE(&reqFifo[buffSlot].size, size);
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
// (reordered store after store is possible on POWER, though not on x86)
__sync_synchronize();
STORE(&reqFifo[buffSlot].recvBuff, NULL); // Notify recvProxy
// Retire one slice worth of steps and expose the new count through sendMem->head.
args->done += args->sliceSteps;
resources->sendMem->head = args->done;
args->idle = 0;
// All steps of this operation are complete: remember where the next
// operation starts and mark this one as finished.
if (args->done == args->end) {
resources->step = args->end;
args->state = ncclProxyOpNone;
}
// Progress was made (idle cleared); return now rather than doing more work in this call.
return ncclSuccess;
}
}
}
return ncclSuccess;
@@ -361,56 +341,79 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
// Round to next multiple of chunkSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->head = resources->step;
args->tail = resources->step;
args->end = args->head + args->nsteps;
args->posted = args->received = args->transmitted = args->done = resources->step;
args->end = resources->step + args->nsteps;
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
args->idle = 1;
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* mhandle = resources->mhandles[p];
struct reqSlot* reqFifo = resources->reqFifo;
if (args->head < args->end) {
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize;
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, reqFifo[buffSlot].recvBuff);
args->tail += args->sliceSteps;
args->idle = 0;
}
if (args->tail > args->head) {
int buffSlot = args->head%NCCL_STEPS;
if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size);
args->head += args->sliceSteps;
if (args->protocol == NCCL_PROTO_LL) { // ll
// re-attach flag
uint32_t flag = args->head;
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t));
for (int i=0; i<nFifoLines; i++) {
lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
}
} else if (args->protocol == NCCL_PROTO_SIMPLE) {
if (resources->useGdr) NCCLCHECK(collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle));
resources->hostRecvMem->tail = args->head;
if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
int buffSlot = args->posted%NCCL_STEPS;
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize);
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
args->posted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
}
if (args->posted > args->received) {
int buffSlot = args->received%NCCL_STEPS;
if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete
TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
if (args->protocol == NCCL_PROTO_LL) { // ll
// re-attach flag
uint32_t flag = NCCL_LL_FLAG(args->received + 1);
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
int nFifoLines = DIVUP(LOAD(&reqFifo[buffSlot].size), 2*sizeof(uint32_t));
for (int i=0; i<nFifoLines; i++) {
lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
}
args->idle = 0;
}
args->received += args->sliceSteps;
if (LOAD(&reqFifo[buffSlot].size) > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
NCCLCHECK(collNetIflush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, LOAD(&reqFifo[buffSlot].size), mhandle, args->requests+buffSlot));
} else {
args->requests[buffSlot] = NULL;
}
args->idle = 0;
return ncclSuccess;
}
}
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpNone;
// Flush stage: steps in [transmitted, received) have had their data received
// but the tail has not been advanced for them yet.
if (args->received > args->transmitted) {
// Progress flush operations
int buffSlot = args->transmitted%NCCL_STEPS;
// Defaults to "done" so that a slot with no pending request (NULL) completes immediately.
int done = 1;
if (args->requests[buffSlot]) NCCLCHECK(collNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
args->transmitted += args->sliceSteps;
// Full memory barrier before the new tail value is written to recvMem.
__sync_synchronize();
resources->recvMem->tail = args->transmitted;
args->idle = 0;
// Progress was made; return now rather than doing more work in this call.
return ncclSuccess;
}
}
// Done stage: steps in [done, transmitted) have had their tail published;
// retire them once the head counter in sendMem has moved past them.
// NOTE(review): head is presumably advanced by the GPU side through
// recv->conn.head — confirm against the device code.
if (args->transmitted > args->done) {
volatile uint64_t* sendHead = &resources->sendMem->head;
// The head is sampled once; the loop below consumes everything it covers.
uint64_t done = LOAD(sendHead);
while (done > args->done &&
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
args->transmitted > args->done) {
args->done += args->sliceSteps;
args->idle = 0;
// All steps of this operation are complete: remember where the next
// operation starts and mark this one as finished.
if (args->done == args->end) {
resources->step = args->end;
args->state = ncclProxyOpNone;
}
}
}
}
return ncclSuccess;
+286 -218
Wyświetl plik
@@ -9,7 +9,7 @@
#include "net.h"
#include "graph.h"
#include <sys/time.h>
#include <numaif.h>
#include "collectives.h"
struct netConnectInfo {
ncclNetHandle_t netHandle;
@@ -25,6 +25,7 @@ struct netSendResources {
struct ncclRecvMem* recvMem;
int netDev;
int useGdr;
int shared;
char* buffers[LOC_COUNT];
int buffSizes[LOC_COUNT];
void* mhandles[LOC_COUNT];
@@ -40,6 +41,7 @@ struct netRecvResources {
struct ncclRecvMem* recvMem;
int netDev;
int useGdr;
int shared;
char* buffers[LOC_COUNT];
int buffSizes[LOC_COUNT];
void* mhandles[LOC_COUNT];
@@ -55,118 +57,118 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
return ncclSuccess;
}
NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct netSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
send->conn.tail = &resources->recvMem->tail;
send->conn.fifo = resources->recvMem->sizesFifo;
send->conn.sizesFifo = resources->recvMem->sizesFifo;
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
send->conn.head = &resources->sendMem->head;
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers
for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
int protoLoc[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
// Dedicated buffers are only allocated here when the connection does not use
// shared buffers (resources->shared == 0).
if (resources->shared == 0) {
// Memory location per protocol: LL always uses host memory; the other
// protocols use device memory when useGdr is set, host memory otherwise.
int protoLoc[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
}
// Sum the per-protocol buffer sizes into one total per memory location.
int buffSizes[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
buffSizes[p] = send->comm->buffSizes[p];
resources->buffSizes[protoLoc[p]] += buffSizes[p];
}
// One allocation per location, skipped when no protocol maps to it.
if (resources->buffSizes[LOC_DEVMEM]) {
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
}
if (resources->buffSizes[LOC_HOSTMEM]) {
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
}
// Carve each protocol's buffer out of its location's allocation at a running
// offset, and point its memory-handle slot at that location's handle.
int offsets[LOC_COUNT];
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
offsets[protoLoc[p]] += buffSizes[p];
}
}
int buffSizes[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
// Only allocate buffers for simple for p2p connections
buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : send->comm->buffSizes[p];
resources->buffSizes[protoLoc[p]] += buffSizes[p];
}
if (resources->buffSizes[LOC_DEVMEM]) {
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
}
char line[16];
if (resources->buffSizes[LOC_HOSTMEM]) {
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
int status[1] = {-1};
line[0]= 0;
if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
sprintf(line, "/MEM%d", status[0]);
}
int offsets[LOC_COUNT];
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
offsets[protoLoc[p]] += buffSizes[p];
}
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : line);
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
return ncclSuccess;
}
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
struct netRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
recv->conn.tail = &resources->recvMem->tail;
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
recv->conn.head = &resources->sendMem->head;
int protoLoc[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p
// Memory location per protocol: every protocol (LL included) uses device
// memory when useGdr is set, host memory otherwise.
int protoLoc[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
}
// Sum the per-protocol buffer sizes into one total per memory location.
int buffSizes[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
buffSizes[p] = recv->comm->buffSizes[p];
resources->buffSizes[protoLoc[p]] += buffSizes[p];
}
// One allocation per location, skipped when no protocol maps to it.
if (resources->buffSizes[LOC_DEVMEM]) {
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
}
if (resources->buffSizes[LOC_HOSTMEM]) {
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
}
// Carve each protocol's buffer out of its location's allocation at a running
// offset, and point its memory-handle slot at that location's handle.
int offsets[LOC_COUNT];
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
offsets[protoLoc[p]] += buffSizes[p];
}
}
int buffSizes[NCCL_NUM_PROTOCOLS];
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
// Only allocate buffers for simple for p2p connections
buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : recv->comm->buffSizes[p];
resources->buffSizes[protoLoc[p]] += buffSizes[p];
}
if (resources->buffSizes[LOC_DEVMEM]) {
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
}
char line[16];
if (resources->buffSizes[LOC_HOSTMEM]) {
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
int status[1] = {-1};
line[0]= 0;
if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
sprintf(line, "/MEM%d", status[0]);
}
int offsets[LOC_COUNT];
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
offsets[protoLoc[p]] += buffSizes[p];
}
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : line);
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
return ncclSuccess;
}
ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
@@ -174,6 +176,13 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
// Connect to remote peer
NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
// Shared-buffer connections did not allocate buffers of their own; fetch the
// shared ones here, before the memory-registration step that follows.
if (resources->shared) {
// Get shared buffers
// Location follows useGdr: device memory with GDR, host memory without.
int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
// Fills in the size and base pointer entries for the chosen location.
NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
// Only the SIMPLE protocol gets a memory-handle slot on this path.
resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
}
if (resources->buffSizes[LOC_DEVMEM]) {
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
}
@@ -184,7 +193,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
}
/* Connect to this peer */
ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
@@ -192,6 +201,13 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
// Shared-buffer connections did not allocate buffers of their own; fetch the
// shared ones here, before the memory-registration step that follows.
if (resources->shared) {
// Get shared buffers
// Location follows useGdr: device memory with GDR, host memory without.
int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
// Fills in the size and base pointer entries for the chosen location.
NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
// Only the SIMPLE protocol gets a memory-handle slot on this path.
resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
}
if (resources->buffSizes[LOC_DEVMEM]) {
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
}
@@ -209,8 +225,10 @@ ncclResult_t netSendFree(void* transportResources) {
if (resources->buffers[l])
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
}
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
if (resources->shared == 0) {
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
}
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
return ncclSuccess;
@@ -224,138 +242,144 @@ ncclResult_t netRecvFree(void* transportResources) {
if (resources->buffers[l])
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
}
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
if (resources->shared == 0) {
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
}
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
free(resources);
return ncclSuccess;
}
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
if (args->state == ncclProxyOpReady) {
// Round to next multiple of chunkSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->head = resources->step;
args->tail = resources->step;
args->end = args->head + args->nsteps;
args->posted = args->transmitted = args->done = resources->step;
args->end = resources->step + args->nsteps;
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* mhandle = *(resources->mhandlesProto[p]);
args->idle = 1;
if (args->head < args->end) {
int buffSlot = args->tail%NCCL_STEPS;
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
int buffSize = stepSize*args->sliceSteps;
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
if (args->sendbytes < buffSize) buffSize = args->sendbytes;
// Post buffers to the GPU
if (args->posted < args->end && args->posted < args->done + NCCL_STEPS) {
if (resources->shared) {
char* ptr;
NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, &ptr));
if (ptr == NULL) return ncclInternalError;
resources->recvMem->ptrsFifo[args->posted%NCCL_STEPS] = ptr;
__sync_synchronize();
volatile uint64_t* sendHead = &resources->sendMem->head;
args->posted += args->sliceSteps;
STORE(sendHead, args->posted - NCCL_STEPS);
} else args->posted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
}
// Check whether we received data from the GPU and send it to the network
int buffSlot = args->transmitted%NCCL_STEPS;
if (args->transmitted < args->posted && args->transmitted < args->done + NCCL_STEPS) {
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = LOAD(sizesFifo+buffSlot);
char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
int ready = 1;
if (args->protocol == NCCL_PROTO_LL128) {
if (args->tail < LOAD(recvTail)) {
if (LOAD(sizesFifo+buffSlot) != -1) {
int ready = resources->useGdr;
if (!ready) {
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
// called threadfence()
uint64_t flag = args->tail + 1;
int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
ready = 1;
for (int i=0; i<nFifoLines; i++) {
if (LOAD(lines+i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS) != flag) { ready = 0; break; }
}
}
if (ready) {
// Send through network
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
}
}
int ready = resources->useGdr;
if (!ready) {
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
// called threadfence()
uint64_t flag = args->transmitted + 1;
int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
volatile uint64_t* lines = (volatile uint64_t*)buff;
ready = 1;
for (int i=0; i<nFifoLines; i++) {
if (LOAD(lines+i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS) != flag) { ready = 0; break; }
}
}
} else if (args->protocol == NCCL_PROTO_LL) {
int size = LOAD(sizesFifo+buffSlot);
if (size != -1) {
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
size = nFifoLines * sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
int ready = 1;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *f2 = &lines[i].flag2;
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
}
if (ready) {
NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
}
}
}
} else if (args->tail < LOAD(recvTail)) {
// Send through network
if (LOAD(sizesFifo+buffSlot) != -1) {
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
#ifdef ENABLE_PROFILING
if (args->channel->active_req == 0) {
gettimeofday(&args->channel->tvs, NULL);
args->channel->sizes = 0;
}
args->channel->active_req ++;
args->channel->sizes += LOAD(sizesFifo+buffSlot);
args->channel->send_byte += LOAD(sizesFifo+buffSlot);
#endif
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
args->idle = 0;
}
uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *f2 = &lines[i].flag2;
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
}
}
}
if (args->head < args->tail) {
int done;
int buffSlot = args->head%NCCL_STEPS;
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
if (ready) {
// Data is ready, try to send.
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
args->channel->active_req --;
if (args->channel->active_req == 0) {
struct timeval tv;
gettimeofday(&tv, NULL);
float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
if (delta) {
args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
args->channel->bw_count ++;
}
gettimeofday(&args->channel->tvs, NULL);
args->channel->sizes = 0;
}
}
args->channel->active_req ++;
args->channel->sizes += LOAD(sizesFifo+buffSlot);
args->channel->send_byte += LOAD(sizesFifo+buffSlot);
#endif
args->head += args->sliceSteps;
STORE(&resources->sendMem->head, args->head);
args->idle = 0;
TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to -1 before we update the head.
__sync_synchronize();
args->transmitted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
}
}
}
}
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpNone;
// Check whether the network has completed some send operations.
if (args->done < args->transmitted) {
int done;
int buffSlot = args->done%NCCL_STEPS;
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
args->channel->active_req --;
if (args->channel->active_req == 0) {
struct timeval tv;
gettimeofday(&tv, NULL);
float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
if (delta) {
args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
args->channel->bw_count ++;
}
}
}
#endif
if (resources->shared) {
char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, ptr));
}
args->done += args->sliceSteps;
if (resources->shared == 0) {
resources->sendMem->head = args->done;
}
args->idle = 0;
if (args->done == args->end) {
resources->step = args->end;
args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
}
}
return ncclSuccess;
@@ -366,45 +390,57 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
// Round to next multiple of chunkSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
args->head = resources->step;
args->tail = resources->step;
args->end = args->head + args->nsteps;
args->posted = args->received = args->transmitted = args->done = resources->step;
args->end = resources->step + args->nsteps;
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
args->idle = 1;
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
char* localBuff = args->connector->conn.buffs[p];
void* mhandle = *(resources->mhandlesProto[p]);
if (args->head < args->end) {
volatile uint64_t* sendHead = &resources->sendMem->head;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
int sliceSize = stepSize * args->sliceSteps;
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
if (args->channel->active_req == 0) {
gettimeofday(&args->channel->tvs, NULL);
args->channel->sizes = 0;
}
args->channel->active_req ++;
}
#endif
args->tail += args->sliceSteps;
args->idle = 0;
}
int buffSize = stepSize*args->sliceSteps;
if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
if (args->recvbytes < buffSize) buffSize = args->recvbytes;
if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
int buffSlot = args->posted%NCCL_STEPS;
char* ptr;
if (resources->shared) {
NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, &ptr));
if (ptr == NULL) return ncclInternalError;
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
STORE(ptrsFifo+buffSlot, ptr);
} else {
ptr = localBuff+buffSlot*stepSize;
}
if (args->tail > args->head) {
int buffSlot = args->head%NCCL_STEPS;
int done, size;
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
if (done) {
args->head += args->sliceSteps;
if (args->protocol == NCCL_PROTO_SIMPLE) {
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
if (args->channel->active_req == 0) {
gettimeofday(&args->channel->tvs, NULL);
args->channel->sizes = 0;
}
args->channel->active_req ++;
}
#endif
args->posted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
} else if (resources->shared) {
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
}
}
if (args->posted > args->received) {
int buffSlot = args->received%NCCL_STEPS;
int done, size;
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
if (done) {
args->received += args->sliceSteps;
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
args->channel->active_req --;
args->channel->sizes += size;
args->channel->recv_byte += size;
@@ -417,18 +453,50 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
args->channel->bw_count ++;
}
}
#endif
if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
STORE(&resources->recvMem->tail, args->head);
}
args->idle = 0;
}
#endif
if (size > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
// Don't pass data to the GPU yet, flush first.
volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, args->requests+buffSlot));
} else {
args->requests[buffSlot] = NULL;
}
args->idle = 0;
return ncclSuccess;
}
}
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpNone;
if (args->received > args->transmitted) {
// Progress flush operations
int buffSlot = args->transmitted%NCCL_STEPS;
int done = 1;
if (args->requests[buffSlot]) NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
args->transmitted += args->sliceSteps;
__sync_synchronize();
resources->recvMem->tail = args->transmitted;
args->idle = 0;
return ncclSuccess;
}
}
if (args->transmitted > args->done) {
volatile uint64_t* sendHead = &resources->sendMem->head;
uint64_t done = LOAD(sendHead);
while (done > args->done &&
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
args->transmitted > args->done) {
if (resources->shared) {
char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
}
args->done += args->sliceSteps;
args->idle = 0;
if (args->done == args->end) {
resources->step = args->end;
args->state = ncclProxyOpNone;
}
}
}
}
return ncclSuccess;
+86 -69
Wyświetl plik
@@ -25,9 +25,8 @@
#include "ibvwrap.h"
#define USE_RDMA_WRITE 1
#define USE_RDMA_SEND_INLINE 0
#define MAXNAMESIZE 64
static char ncclIbIfName[MAX_IF_NAME_SIZE];
static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
static union socketAddress ncclIbIfAddr;
static int ncclNIbDevs = -1;
@@ -58,6 +57,8 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
NCCL_PARAM(IbSl, "IB_SL", 0);
NCCL_PARAM(IbTc, "IB_TC", 0);
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
@@ -200,7 +201,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
}
line[1023] = '\0';
char addrline[1024];
char addrline[SOCKET_NAME_MAXLEN+1];
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
}
pthread_mutex_unlock(&ncclIbLock);
@@ -251,7 +252,7 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
return ncclSuccess;
}
#define MAX_REQUESTS 128
#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
struct ncclIbQpInfo {
uint32_t lid;
@@ -272,18 +273,19 @@ struct ncclIbHandle {
union socketAddress connectAddr;
};
struct ncclIbVerbs {
struct ibv_pd* pd;
struct ibv_cq* cq;
};
// Bookkeeping for one IB operation. Instances are slots of a fixed-size
// array and are handed out by ncclIbGetRequest.
struct ncclIbRequest {
// Non-zero while the slot is allocated; cleared to release it.
int used;
// Reset to 0 when the slot is allocated.
int type;
// Verbs state (PD/CQ) this request was allocated against.
struct ncclIbVerbs* verbs;
// Set to 1 once a completion for this request has been polled.
int done;
// Completions still expected; starts at 1, and ncclIbTest reports the
// request as done when it reaches 0.
int events;
// Message size; initialised to -1 and filled in by the send/recv paths.
int size;
// 1 for internal (FIFO post) requests, which are released as soon as
// they complete.
int free;
};
// Verbs state embedded in both ncclIbSendComm and ncclIbRecvComm.
// ncclIbRegMr casts a send or recv comm pointer directly to this type, so
// it has to sit at the same (leading) offset in both comm structs.
struct ncclIbVerbs {
// Protection domain, allocated in ncclIbInitVerbs.
struct ibv_pd* pd;
// Completion queue, created in ncclIbInitVerbs.
struct ibv_cq* cq;
// NOTE(review): presumably padding that keeps the members following
// `verbs` in the comm structs 32-byte aligned -- confirm.
uint64_t pad[2];
// Request pool scanned by ncclIbGetRequest.
struct ncclIbRequest reqs[MAX_REQUESTS];
};
struct ncclIbListenComm {
@@ -297,18 +299,23 @@ struct alignas(64) ncclIbSendFifo {
uint32_t seq;
uint32_t rkey;
uint32_t ready;
uint64_t pad[1]; // Pad FIFO element size to be 32-bytes
};
struct ncclIbSendComm {
struct ncclIbVerbs verbs;
struct ncclIbSendFifo fifo[MAX_REQUESTS];
struct ncclIbRequest reqs[MAX_REQUESTS];
uint32_t fifoHead;
int fd;
int ready;
struct ibv_qp* qp;
struct ibv_mr* fifoMr;
};
// The SendFifo needs to be 32-byte aligned and each element needs
// to be a 32-byte multiple, so that an entry does not get split and
// written out of order when IB Relaxed Ordering is enabled
static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
struct ncclIbGpuFlush {
int enabled;
@@ -331,16 +338,17 @@ struct ncclIbRemFifo {
struct ncclIbRecvComm {
struct ncclIbVerbs verbs;
struct ncclIbRemFifo remFifo;
struct ncclIbRequest reqs[MAX_REQUESTS];
int fd;
int ready;
struct ibv_qp* qp;
struct ncclIbGpuFlush gpuFlush;
};
// remFifo is the source of the RDMA writes that fill the peer's send FIFO;
// keep it 32-byte aligned like the FIFO it mirrors.
static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbRecvComm remFifo must be 32-byte aligned");
// Set up the verbs state for a connection on device context `ctx`:
// allocates the protection domain into verbs->pd and creates the completion
// queue into verbs->cq. Returns ncclSuccess after the NCCLCHECK-wrapped
// verbs calls.
ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, MAX_REQUESTS, NULL, NULL, 0));
// Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS, NULL, NULL, 0));
return ncclSuccess;
}
@@ -356,17 +364,17 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
qpInitAttr.send_cq = verbs->cq;
qpInitAttr.recv_cq = verbs->cq;
qpInitAttr.qp_type = IBV_QPT_RC;
// We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM)
// We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
qpInitAttr.cap.max_send_sge = 1;
qpInitAttr.cap.max_recv_sge = 1;
qpInitAttr.cap.max_inline_data = 0;
qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr));
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_INIT;
qpAttr.pkey_index = 0;
qpAttr.pkey_index = ncclParamIbPkey();
qpAttr.port_num = ib_port;
qpAttr.qp_access_flags = access_flags;
NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
@@ -481,7 +489,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
socklen_t socklen = sizeof(struct sockaddr_in);
SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
struct ncclIbQpInfo remQpInfo;
NCCLCHECK(socketReceive(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
NCCLCHECK(socketRecv(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
// IB setup
ibv_context* ctx = ncclIbDevs[lComm->dev].context;
@@ -509,14 +517,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
#if USE_RDMA_SEND_INLINE
// Determine whether the remFifo element data can be sent INLINE
struct ibv_qp_attr attr;
struct ibv_qp_init_attr init_attr;
NCCLCHECK(wrap_ibv_query_qp(qp, &attr, IBV_QP_CAP, &init_attr));
if (init_attr.cap.max_inline_data >= rComm->remFifo.sge.length) rComm->remFifo.flags = IBV_SEND_INLINE;
#endif
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
// Allocate Flush dummy buffer for GPU Direct RDMA
rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
@@ -553,16 +554,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
return ncclSuccess;
}
ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** req) {
ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
for (int i=0; i<MAX_REQUESTS; i++) {
struct ncclIbRequest* r = reqs+i;
struct ncclIbRequest* r = verbs->reqs+i;
if (r->used == 0) {
r->used = 1;
r->type = 0;
r->verbs = NULL;
r->done = 0;
r->verbs = verbs;
r->events = 1;
r->size = -1;
r->free = 0;
*req = r;
return ncclSuccess;
}
@@ -571,6 +571,10 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
*req = NULL;
return ncclInternalError;
}
// Release request `r` by clearing its `used` flag, which makes the slot
// eligible to be handed out again by ncclIbGetRequest.
// Always returns ncclSuccess.
ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
r->used = 0;
return ncclSuccess;
}
ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
struct ncclIbQpInfo remQpInfo;
@@ -585,7 +589,6 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
NCCLCHECK(ncclIbRtrQp(qp, &remQpInfo));
NCCLCHECK(ncclIbRtsQp(qp));
comm->ready = 1;
// Block until this is done. It *should* not block indefinitely.
NCCLCHECK(socketSend(comm->fd, &comm->ready, sizeof(int)));
@@ -606,6 +609,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
#define REG_ALIGN (4096)
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
uint64_t addr = (uint64_t)data;
assert(size > 0);
@@ -639,8 +643,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
if (LOAD(readyPtr) == 0) { *request = NULL; return ncclSuccess; }
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
req->verbs = &comm->verbs;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
req->size = size;
struct ibv_send_wr wr;
@@ -656,14 +659,10 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
wr.sg_list = &sge;
wr.num_sge = 1;
}
#if USE_RDMA_WRITE == 0
wr.opcode = IBV_WR_SEND;
wr.send_flags = IBV_SEND_SIGNALED;
int useAr = 0;
if (size > ncclParamIbArThreshold()) {
useAr = 1;
}
#if USE_RDMA_WRITE
#else
__sync_synchronize(); // order the readyPtr load against rkey load below
// Sanity checks to catch user collective call count/size mismatches
// plus any potential programming errors
@@ -672,7 +671,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
return ncclInternalError;
}
int useAr = 0;
if (size > ncclParamIbArThreshold()) {
useAr = 1;
}
wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
wr.send_flags = useAr ? 0 : IBV_SEND_SIGNALED;
wr.wr.rdma.remote_addr = LOAD(&slot->addr);
wr.wr.rdma.rkey = LOAD(&slot->rkey);
wr.imm_data = size; // Send the message size via imm_data
@@ -696,7 +700,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
wr.sg_list = NULL;
wr.num_sge = 0;
wr.send_flags &= ~IBV_SEND_SIGNALED;
wr.send_flags |= IBV_SEND_SIGNALED;
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
}
#endif
@@ -704,28 +708,51 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
return ncclSuccess;
}
ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size) {
ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) {
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
req->verbs = &comm->verbs;
req->free = 1; // Not a user req ; free as soon as it is complete.
wr.wr_id = (uint64_t)req;
struct ncclIbSendFifo* localElem = comm->remFifo.elems + (comm->remFifo.tail % MAX_REQUESTS);
int slot = comm->remFifo.tail%MAX_REQUESTS;
struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot;
localElem->addr = addr;
localElem->rkey = rkey;
localElem->ready = 1;
localElem->size = size; // Sanity/Debugging
localElem->seq = comm->remFifo.tail; // Sanity/Debugging
wr.wr.rdma.remote_addr = comm->remFifo.addr + (comm->remFifo.tail % MAX_REQUESTS) * sizeof(struct ncclIbSendFifo);
wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo);
wr.wr.rdma.rkey = comm->remFifo.rkey;
comm->remFifo.sge.addr = (uint64_t)localElem;
wr.sg_list = &comm->remFifo.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
wr.send_flags = IBV_SEND_SIGNALED | comm->remFifo.flags; // IBV_SEND_INLINE
wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE
// We need to occasionally post a request with the IBV_SEND_SIGNALED flag, otherwise
// the send queue will never empty.
//
// From https://www.rdmamojo.com/2014/06/30/working-unsignaled-completions/
// "How to use Unsignaled Completion?" / "Gotchas and Pitfalls"
// All posted Send Requested, Signaled and Unsignaled, are considered outstanding until
// a Work Completion that they, or Send Requests that were posted after them, was polled
// from the Completion Queue associated with the Send Queue. This means if one works with
// a Queue Pair that was configured to work with Unsignaled Completions, he must make
// sure that occasionally (before the Send Queue is full with outstanding Send Requests)
// a Send Request that generate Work Completion will be posted.
//
// Not following this rule may lead to a case that the Send Queue is full with Send
// Requests that won't generate Work Completion:
//
// - The Send Queue is full, so no new Send Requests can be posted to it
// - The Send Queue can't be emptied, since no Work Completion can be generated anymore
// (the reason is that no Work Completion, that can generate Work Completion that
// polling it will empty the Send Queue, can be posted)
// - The status of all posted Send Request is considered unknown
//
if (slot == 0) {
wr.send_flags |= IBV_SEND_SIGNALED;
wr.wr_id = (uint64_t)req;
req->events++;
}
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
@@ -742,8 +769,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
struct ibv_mr* mr = (struct ibv_mr*)mhandle;
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
req->verbs = &comm->verbs;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
req->size = size;
struct ibv_recv_wr wr;
@@ -765,17 +791,16 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
*request = req;
// Post to FIFO to notify sender
NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req));
return ncclSuccess;
}
ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
req->verbs = &comm->verbs;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
struct ibv_mr* mr = (struct ibv_mr*)mhandle;
struct ibv_send_wr wr;
@@ -792,11 +817,7 @@ ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
int done = 0;
while (done == 0) {
NCCLCHECK((ncclResult_t)ncclIbTest(req, &done, NULL));
}
*request = req;
return ncclSuccess;
}
@@ -805,10 +826,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
*done = 0;
while (1) {
if (r->done == 1) {
if (r->events == 0) {
*done = 1;
if (size) *size = r->size;
r->used = 0;
NCCLCHECK(ncclIbFreeRequest(r));
return ncclSuccess;
}
@@ -833,11 +854,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
doneReq->size = wc->imm_data;
#endif
}
doneReq->done = 1;
if (doneReq->free == 1) {
// This is an internal (FIFO post) req. Free it immediately.
doneReq->used = 0;
}
doneReq->events--;
}
}
}
@@ -892,7 +909,7 @@ ncclNet_t ncclNetIb = {
ncclIbDeregMr,
ncclIbIsend,
ncclIbIrecv,
ncclIbFlush,
ncclIbIflush,
ncclIbTest,
ncclIbCloseSend,
ncclIbCloseRecv,
+18 -11
Wyświetl plik
@@ -49,17 +49,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
WARN("NET/Socket : no interface found");
return ncclInternalError;
} else {
char line[1024];
char addrline[1024];
#define MAX_LINE_LEN (2047)
char line[MAX_LINE_LEN+1];
char addrline[SOCKET_NAME_MAXLEN+1];
line[0] = '\0';
addrline[SOCKET_NAME_MAXLEN] = '\0';
for (int i=0; i<ncclNetIfs; i++) {
strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
socketToString(&addrs[i].sa, addrline));
}
line[1023] = '\0';
line[MAX_LINE_LEN] = '\0';
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
}
}
@@ -113,8 +115,7 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
#define MAX_SOCKETS 64
#define MAX_THREADS 16
#define MAX_REQUESTS 128
#define MAX_QUEUE_LEN MAX_REQUESTS
#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
#define MIN_CHUNKSIZE (64*1024)
NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
@@ -150,6 +151,7 @@ struct ncclSocketRequest {
struct ncclSocketTaskQueue {
int next;
int len;
struct ncclSocketTask* tasks;
};
@@ -189,7 +191,7 @@ void* persistentSocketThread(void *args_) {
while (1) {
int idle = 1;
int mark = myQueue->next; // mark newest task seen
for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) {
for (int i=0; i<myQueue->len; i+=nSocksPerThread) {
int repeat;
do {
repeat = 0;
@@ -364,7 +366,11 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
// create helper threads and prepare per-thread task queue
if (queue->tasks == NULL) {
NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN));
// Each request can be split into up to nSocks tasks, which are distributed
// over nThreads threads, so each per-thread queue needs room for
// MAX_REQUESTS * DIVUP(nSocks, nThreads) tasks.
queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads);
NCCLCHECK(ncclCalloc(&queue->tasks, queue->len));
queue->next = 0;
res->comm = comm;
pthread_mutex_init(&res->threadLock, NULL);
@@ -383,7 +389,7 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
r->used = 1;
*req = r;
pthread_mutex_lock(&res->threadLock);
queue->next = (queue->next+1)%MAX_QUEUE_LEN;
queue->next = (queue->next+1)%queue->len;
res->state = start;
pthread_cond_signal(&res->threadCond);
pthread_mutex_unlock(&res->threadLock);
@@ -421,6 +427,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
// divide into subtasks
int chunkOffset = 0, i = 0;
if (r->comm->nSocks > 0) {
// each request can be divided into up to nSocks tasks
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
while (chunkOffset < r->size) {
int chunkSize = std::min(taskSize, r->size-chunkOffset);
@@ -478,7 +485,7 @@ ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle
return ncclSuccess;
}
ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
// We don't support CUDA pointers, so we don't need a flush operation
return ncclInternalError;
}
@@ -527,7 +534,7 @@ ncclNet_t ncclNetSocket = {
ncclSocketDeregMr,
ncclSocketIsend,
ncclSocketIrecv,
ncclSocketFlush,
ncclSocketIflush,
ncclSocketTest,
ncclSocketClose,
ncclSocketClose,
+98 -109
Wyświetl plik
@@ -12,26 +12,30 @@
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#endif
#include "shm.h"
#include "bootstrap.h"
struct p2pConnectInfo {
int direct;
int rank;
int read;
union {
void* directPtr;
hipIpcMemHandle_t devIpc;
};
void* directPtr;
hipIpcMemHandle_t devIpc;
};
// Per-connector state kept by the P2P transport for the send side.
// Allocated in p2pSendSetup and released in p2pSendFree.
struct p2pSendResources {
// Device memory for this connector as mapped by p2pMap() during setup.
// p2pSendFree() passes it to hipFree() unless the memory was allocated
// remotely (remoteId != -1), in which case it is reset to NULL first.
struct ncclSendMem* devMem;
// Pointer obtained from hipIpcOpenMemHandle() (NULL when p2pMap() used a
// direct pointer); closed with hipIpcCloseMemHandle() in p2pSendFree().
void* ipcPtr;
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
// Identifier returned by bootstrapRemAlloc() when the buffer lives on an
// intermediate rank; -1 means no remote allocation was made.
int remoteId;
// Rank that owns the buffer (copied from p2pConnectInfo::rank): either this
// rank or the intermediate rank. Passed to bootstrapRemFree().
int memRank;
// comm->bootstrap handle, kept so the free path can call bootstrapRemFree().
void* bootstrap;
};
// Per-connector state kept by the P2P transport for the receive side.
// Allocated in p2pRecvSetup and released in p2pRecvFree.
struct p2pRecvResources {
// Device memory for this connector as mapped by p2pMap() during setup.
// p2pRecvFree() passes it to hipFree() unless the memory was allocated
// remotely (remoteId != -1), in which case it is reset to NULL first.
struct ncclRecvMem* devMem;
// Pointer obtained from hipIpcOpenMemHandle() (NULL when p2pMap() used a
// direct pointer); closed with hipIpcCloseMemHandle() in p2pRecvFree().
void* ipcPtr;
// Identifier returned by bootstrapRemAlloc() when the buffer lives on an
// intermediate rank; -1 means no remote allocation was made.
int remoteId;
// Rank that owns the buffer (copied from p2pConnectInfo::rank): either this
// rank or the intermediate rank. Passed to bootstrapRemFree().
int memRank;
// comm->bootstrap handle, kept so the free path can call bootstrapRemFree().
void* bootstrap;
};
#include <sys/types.h>
@@ -69,9 +73,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
}
// Check topology / p2p level.
int read;
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, &read));
int intermediateRank;
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
if (*ret == 0) return ncclSuccess;
if (intermediateRank != -1) return ncclSuccess;
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int cudaDev1 = busIdToCudaDev(info1->busId);
@@ -114,31 +119,52 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Setting this to non zero causes P2P to use Reads rather than Writes
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
static int p2pUseRead(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
int readEnable = ncclParamP2pReadEnable();
if (readEnable != -2) return readEnable;
int p2p, read;
static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
int p2p;
// Queries the topology to see if the GPUs are Ampere and
// connected via NVLink, if so we enable P2P Read by default
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, &read));
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));
return read;
int readEnable = ncclParamP2pReadEnable();
if (readEnable != -2) *read = readEnable;
return ncclSuccess;
}
/* Resolve the buffer described by p2pInfo into a pointer usable by myInfo's process.
 *
 * myInfo   : peer info of the rank that wants to access the buffer
 * peerInfo : peer info of the rank that owns the buffer
 * p2pInfo  : connect info carrying both the direct pointer and the IPC handle
 * devMem   : [out] pointer through which the buffer can be accessed
 * ipcPtr   : [out] the IPC mapping (to be closed later), or NULL when the
 *            direct pointer was used
 *
 * Returns ncclSuccess, or ncclInternalError if peer access cannot be enabled;
 * a failing hipIpcOpenMemHandle is reported through CUDACHECK.
 */
static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) {
  // Owner has a different pidHash: the buffer has to be opened through its IPC handle.
  if (myInfo->pidHash != peerInfo->pidHash) {
    CUDACHECK(hipIpcOpenMemHandle(devMem, p2pInfo->devIpc, hipIpcMemLazyEnablePeerAccess));
    *ipcPtr = *devMem;
    return ncclSuccess;
  }

  // Same pidHash: the direct pointer is usable, but a buffer on another
  // device first needs peer access enabled.
  if (peerInfo->cudaDev != myInfo->cudaDev) {
    hipError_t rc = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
    switch (rc) {
      case hipSuccess:
        break;
      case hipErrorPeerAccessAlreadyEnabled:
        // Not a failure; presumably this call resets the recorded error state.
        hipGetLastError();
        break;
      default:
        WARN("failed to peer with device %d(=%lx): %d %s",
             peerInfo->cudaDev, peerInfo->busId, rc, hipGetErrorString(rc));
        return ncclInternalError;
    }
  }

  *devMem = p2pInfo->directPtr;
  *ipcPtr = NULL;
  return ncclSuccess;
}
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct p2pSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int useRead = p2pUseRead(topo, myInfo, peerInfo);
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
int sendSize = sizeof(struct ncclSendMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));
resources->next_hdp_reg = 0;
uint32_t linktype, hops;
@@ -154,116 +180,84 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
struct p2pConnectInfo info;
info.read = useRead;
const char* useReadStr = info.read ? "/read" : "";
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%d] -> %d[%d] via P2P/common device%s",
channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr);
return ncclInternalError;
resources->remoteId = -1;
resources->bootstrap = comm->bootstrap;
if (intermediateRank == -1) {
NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize, true));
info.rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
if (useRead == 0) send->conn.direct |= NCCL_DIRECT_GPU;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else {
// Enable P2P access
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == hipErrorPeerAccessAlreadyEnabled) {
hipGetLastError();
} else if (err != hipSuccess) {
WARN("failed to peer with device %d(=%lx): %d %s",
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
}
} else {
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != hipSuccess) {
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
//TRACE_DUMP_IPC(&info.devIpc);
NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
info.rank = intermediateRank;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
comm->peerInfo[intermediateRank].busId, useReadStr);
}
resources->memRank = info.rank;
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
return ncclSuccess;
}
/* Create and return connect structures for this peer to connect to me */
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
struct p2pRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
int useRead = p2pUseRead(topo, myInfo, peerInfo);
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
int recvSize = offsetof(struct ncclRecvMem, buff);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));
struct p2pConnectInfo info;
info.read = useRead;
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
resources->remoteId = -1;
resources->bootstrap = comm->bootstrap;
if (intermediateRank == -1) {
NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize, true));
info.rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
if (useRead == 0) recv->conn.direct |= NCCL_DIRECT_GPU;
} else {
// Enable P2P access
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == hipErrorPeerAccessAlreadyEnabled) {
hipGetLastError();
} else if (err != hipSuccess) {
WARN("failed to peer with device %d(=%lx): %d %s",
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
return ncclInternalError;
}
TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
}
} else {
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != hipSuccess) {
WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
return ncclInternalError;
}
TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
//TRACE_DUMP_IPC(&info.devIpc);
NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
info.rank = intermediateRank;
}
resources->memRank = info.rank;
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
return ncclSuccess;
}
/* Connect/Send to this peer */
static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct ncclRecvMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclRecvMem*)(info->directPtr);
if (info->read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
if (err != hipSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, hipGetErrorString(err));
return ncclUnhandledCudaError;
}
}
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -283,26 +277,12 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
}
/* Connect/Recv from this peer */
ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclSendMem*)(info->directPtr);
if (info->read == 0) {
recv->conn.direct |= NCCL_DIRECT_GPU;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
}
} else {
//TRACE_DUMP_IPC(&info->devIpc);
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
remDevMem = (struct ncclSendMem*)resources->ipcPtr;
if (err != hipSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, hipGetErrorString(err));
return ncclUnhandledCudaError;
}
}
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -316,6 +296,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
}
recv->conn.tail = &resources->devMem->tail;
recv->conn.head = &remDevMem->head;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
return ncclSuccess;
}
@@ -323,6 +304,10 @@ ncclResult_t p2pSendFree(void* resources) {
struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
if (sendRes->ipcPtr)
CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
if (sendRes->remoteId != -1) {
NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap));
sendRes->devMem = NULL;
}
CUDACHECK(hipFree(sendRes->devMem));
free(sendRes);
return ncclSuccess;
@@ -332,6 +317,10 @@ ncclResult_t p2pRecvFree(void* resources) {
struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
if (recvRes->ipcPtr)
CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
if (recvRes->remoteId != -1) {
NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap));
recvRes->devMem = NULL;
}
CUDACHECK(hipFree(recvRes->devMem));
free(recvRes);
return ncclSuccess;
+4 -4
Wyświetl plik
@@ -57,7 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
#define MAX_SHM_NAME_LEN 1024
/* Create and return connect structures for this peer to connect to me */
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct shmSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
@@ -81,7 +81,7 @@ ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
return ncclSuccess;
}
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
struct shmRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
@@ -106,7 +106,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
}
/* Connect to this peer */
ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -131,7 +131,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
return ncclSuccess;
}
ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
@@ -65,13 +65,25 @@ BEGIN {
do {
match($col_1, /\[([0-9]+)\]/, ary)
chan=strtonum(ary[1])
match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\|(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)/, ary)
if(ary[8]!="-1")
treedns[ary[7] "," ary[8] "," chan]="1"
if(ary[9]!="-1")
treedns[ary[7] "," ary[9] "," chan]="1"
if(ary[10]!="-1")
treedns[ary[7] "," ary[10] "," chan]="1"
where = match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\|(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)/, ary)
if(where != 0) {
if(ary[8]!="-1")
treedns[ary[7] "," ary[8] "," chan]="1"
if(ary[9]!="-1")
treedns[ary[7] "," ary[9] "," chan]="1"
if(ary[10]!="-1")
treedns[ary[7] "," ary[10] "," chan]="1"
} else {
where = match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)/, ary)
if(where != 0) {
if(ary[1]!="-1")
treedns[ary[4] "," ary[1] "," chan]="1"
if(ary[2]!="-1")
treedns[ary[4] "," ary[2] "," chan]="1"
if(ary[3]!="-1")
treedns[ary[4] "," ary[3] "," chan]="1"
}
}
if(chan>max_treedn)
max_treedn=chan
col_1=col_1+2
+31 -5
Wyświetl plik
@@ -31,6 +31,32 @@
#include <list>
#include <iterator>
// Proxy argument/state block declared locally for this test; the sender and
// receiver classes each hold one as their `args` member and zero it with
// memset() in launchKernel() before setting head, tail and end.
// NOTE(review): the field set looks like a private copy of the library's
// struct ncclProxyArgs so the test does not depend on its layout -- confirm.
struct ibtestProxyArgs {
proxyProgressFunc_t progress;
struct ncclChannel* channel;
struct ncclConnector* connector;
int sliceSteps;
int chunkSteps;
int nsteps;
uint64_t opCount;
int protocol;
ncclDataType_t dtype;
ncclRedOp_t redOp;
int state; // add component before this line -- it is left out during initialization
// Internal state
// head/tail/end are step counters: launchKernel() resets head and tail to 0
// and stores its `end` argument; head is advanced by sliceSteps.
uint64_t head;
uint64_t tail;
uint64_t end;
// One request handle per step slot (NCCL_STEPS entries); indexed by buffSlot
// when handed to the network calls.
void* requests[NCCL_STEPS];
// Cleared to 0 when the progress loop did some work.
int idle;
// Element linking
pthread_mutex_t mutex;
struct ibtestProxyArgs* next;
struct ibtestProxyArgs* nextPeer;
};
ncclResult_t initNet();
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
@@ -204,7 +230,7 @@ private:
bool runSend;
bool use_gdr_read;
int sliceSteps;
struct ncclProxyArgs args;
struct ibtestProxyArgs args;
ncclResult_t connect(char* ip, uint16_t port) {
inet_pton(AF_INET, ip, &netConnectAddr.sin_addr);
@@ -326,7 +352,7 @@ public:
void launchKernel(uint64_t end) {
*sendHead = 0; *sendTail = 0; *sourceCycle = 0; *sourceBytes = 0;
send_sizes = 0; send_bw_cumulative = 0; send_bw_count =0; send_byte = 0;
memset(&args, 0, sizeof(struct ncclProxyArgs));
memset(&args, 0, sizeof(struct ibtestProxyArgs));
args.head = 0;
args.tail = 0;
args.end = end;
@@ -365,7 +391,7 @@ private:
bool runRecv;
bool use_gdr_write;
int sliceSteps;
struct ncclProxyArgs args;
struct ibtestProxyArgs args;
ncclResult_t listen() {
printf("GDR Write %s\n", use_gdr_write ? "enabled" : "disabled");
@@ -472,7 +498,7 @@ public:
}
args.head += args.sliceSteps;
recv_byte += size;
NCCLCHECK(ncclNetFlush(netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
NCCLCHECK(ncclNetIflush(netRecvComm, localBuff+buffSlot*stepSize, size, mhandle, args.requests+buffSlot));
STORE(recvHead, args.head);
args.idle = 0;
}
@@ -486,7 +512,7 @@ public:
void launchKernel(uint64_t end) {
*recvHead = 0; *recvTail = 0; *recvErrorCount = 0; *sinkCycle = 0, *sinkBytes = 0;
recv_sizes = 0; recv_bw_cumulative = 0; recv_bw_count =0; recv_byte = 0;
memset(&args, 0, sizeof(struct ncclProxyArgs));
memset(&args, 0, sizeof(struct ibtestProxyArgs));
args.head = 0;
args.tail = 0;
args.end = end;

Some files were not shown because too many files have changed in this diff Show More