Merge remote-tracking branch 'nccl/master' into no-target-id

[ROCm/rccl commit: d469947641]
2020-12-01 11:33:47 -05:00
commit adff98765c
@@ -89,10 +89,6 @@ set(CU_SOURCES
    src/collectives/device/broadcast.cu
    src/collectives/device/reduce_scatter.cu
    src/collectives/device/sendrecv.cu
-    src/collectives/device/gather.cu
-    src/collectives/device/scatter.cu
-    src/collectives/device/all_to_all.cu
-    src/collectives/device/all_to_allv.cu
    src/collectives/device/functions.cu)

 set(CPP_SOURCES)
@@ -11,6 +11,7 @@ KEEP ?= 0
 DEBUG ?= 0
 TRACE ?= 0
 PROFAPI ?= 0
+NVTX ?= 1

 NVCC = $(CUDA_HOME)/bin/nvcc

@@ -87,6 +88,10 @@ ifneq ($(TRACE), 0)
 CXXFLAGS  += -DENABLE_TRACE
 endif

+ifeq ($(NVTX), 0)
+CXXFLAGS  += -DNVTX_DISABLE
+endif
+
 ifneq ($(KEEP), 0)
 NVCUFLAGS += -keep
 endif
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 7
-NCCL_PATCH   := 8
+NCCL_MINOR   := 8
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -9,7 +9,7 @@ Package: libnccl${nccl:Major}
 Section: libs
 Architecture: ${pkg:Arch}
 Depends: ${misc:Depends}, ${shlibs:Depends}
-Description: NVIDIA Collectives Communication Library (NCCL) Runtime
+Description: NVIDIA Collective Communication Library (NCCL) Runtime
 NCCL (pronounced "Nickel") is a stand-alone library of standard collective
 communication routines for GPUs, implementing all-reduce, all-gather, reduce,
 broadcast, and reduce-scatter.
@@ -21,7 +21,7 @@ Package: libnccl-dev
 Section: libdevel
 Architecture: ${pkg:Arch}
 Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
-Description: NVIDIA Collectives Communication Library (NCCL) Development Files
+Description: NVIDIA Collective Communication Library (NCCL) Development Files
 NCCL (pronounced "Nickel") is a stand-alone library of standard collective
 communication routines for GPUs, implementing all-reduce, all-gather, reduce,
 broadcast, and reduce-scatter.
@@ -1,7 +1,7 @@
 Name:           libnccl
 Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
 Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
-Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Summary:        NVIDIA Collective Communication Library (NCCL) Runtime

 Group:          Development/Libraries
 License:        BSD
@@ -18,13 +18,13 @@ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
 sockets.

 %package devel
-Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
 Group:          Development/Libraries
 %description devel
 NCCL development files

 %package static
-Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
 Group:          Development/Libraries
 %description static
 NCCL static library
@@ -13,144 +13,77 @@
 #include <unistd.h>
 #include <sys/types.h>

-struct bootstrapNetComm {
-  int fd;
-};
-
 /* Init functions */
-static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
-static int bootstrapNetIfs = -1;
+static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
+static union socketAddress bootstrapNetIfAddr;
+static int bootstrapNetInitDone = 0;
 pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;

 ncclResult_t bootstrapNetInit() {
-  if (bootstrapNetIfs == -1) {
+  if (bootstrapNetInitDone == 0) {
    pthread_mutex_lock(&bootstrapNetLock);
-    if (bootstrapNetIfs == -1) {
-      bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-      if (bootstrapNetIfs <= 0) {
-        WARN("Bootstrap : no socket interface found");
-        return ncclInternalError;
-      } else {
-        char line[1024];
-        char addrline[1024];
-        line[0] = '\0';
-        for (int i=0; i<bootstrapNetIfs; i++) {
-          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
-              socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
+    if (bootstrapNetInitDone == 0) {
+      char* env = getenv("NCCL_COMM_ID");
+      if (env) {
+        union socketAddress remoteAddr;
+        if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
+          WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+          return ncclInvalidArgument;
+        }
+        if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+          WARN("NET/Socket : No usable listening interface found");
+          return ncclSystemError;
+        }
+      } else {
+        int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
+        if (nIfs <= 0) {
+          WARN("Bootstrap : no socket interface found");
+          return ncclInternalError;
        }
-        line[1023] = '\0';
-        INFO(NCCL_INIT, "Bootstrap : Using%s", line);
      }
+      char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
+      sprintf(line, " %s:", bootstrapNetIfName);
+      socketToString(&bootstrapNetIfAddr.sa, line+strlen(line));
+      INFO(NCCL_INIT, "Bootstrap : Using%s", line);
+      bootstrapNetInitDone = 1;
    }
    pthread_mutex_unlock(&bootstrapNetLock);
  }
  return ncclSuccess;
 }

-static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
-  NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->fd = -1;
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
-  if (dev >= bootstrapNetIfs) return ncclInternalError;
-  memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
-  return ncclSuccess;
-}
-
 /* Socket Interface Selection type */
 enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };

-static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
-  union socketAddress* connectAddr = (union socketAddress*) netHandle;
-  static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
-  // if dev >= 0, listen based on dev
-  if (dev >= 0) {
-    NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
-  } else if (dev == findSubnetIf) {
-    // handle stores a remote address
-    // need to find a local addr that is in the same network as the remote addr
-    union socketAddress localAddr;
-    char ifName[MAX_IF_NAME_SIZE];
-    if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
-      WARN("NET/Socket : No usable listening interface found");
-      return ncclSystemError;
-    }
-    // pass the local address back
-    memcpy(connectAddr, &localAddr, sizeof(localAddr));
-  } // Otherwise, handle stores a local address
-  struct bootstrapNetComm* comm;
-  NCCLCHECK(bootstrapNetNewComm(&comm));
-  NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
-  *listenComm = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
-  union socketAddress* connectAddr = (union socketAddress*) netHandle;
-  struct bootstrapNetComm* comm;
-  NCCLCHECK(bootstrapNetNewComm(&comm));
-  NCCLCHECK(connectAddress(&comm->fd, connectAddr));
-  *sendComm = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
-  struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
-  struct bootstrapNetComm* rComm;
-  NCCLCHECK(bootstrapNetNewComm(&rComm));
+static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd) {
  struct sockaddr_in sockaddr;
  socklen_t socklen = sizeof(struct sockaddr_in);
-  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
-  *recvComm = rComm;
+  SYSCHECKVAL(accept(listenFd, (struct sockaddr*)&sockaddr, &socklen), "accept", *recvFd);
  return ncclSuccess;
 }

-static ncclResult_t bootstrapNetClose(void* opaqueComm) {
-  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
-  if (comm) {
-    close(comm->fd);
-    free(comm);
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
-
 // Additional sync functions
-static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
-  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
-  NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
-  NCCLCHECK(socketSend(comm->fd, data, size));
+static ncclResult_t bootstrapNetSend(int fd, void* data, int size) {
+  NCCLCHECK(socketSend(fd, &size, sizeof(int)));
+  NCCLCHECK(socketSend(fd, data, size));
  return ncclSuccess;
 }
-static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
-  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
+static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
  int recvSize;
-  NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
+  NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
  if (recvSize > size) {
    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
    return ncclInternalError;
  }
-  NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
-  union socketAddress* connectAddr = (union socketAddress*) netHandle;
-  NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
+  NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
  return ncclSuccess;
 }

 struct extInfo {
  int rank;
  int nranks;
-  ncclNetHandle_t extHandleListenRoot;
-  ncclNetHandle_t extHandleListen;
+  union socketAddress extAddressListenRoot;
+  union socketAddress extAddressListen;
 };

 #include <sys/resource.h>
@@ -163,27 +96,29 @@ static ncclResult_t setFilesLimit() {
  return ncclSuccess;
 }

-static void *bootstrapRoot(void* listenComm) {
+static void *bootstrapRoot(void* args) {
+  int listenFd = (uint64_t)args;
+  ncclResult_t res = ncclSuccess;
+  int nranks = 0, c = 0;
  struct extInfo info;
-  ncclNetHandle_t *rankHandles = NULL;
-  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
-  ncclNetHandle_t zero = { 0 }; // for sanity checking
-  void* tmpComm;
-  ncclResult_t res;
+  union socketAddress *rankAddresses = NULL;
+  union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
+  union socketAddress *zero = NULL;
+  NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
  setFilesLimit();

  TRACE(NCCL_INIT, "BEGIN");
  /* Receive addresses from all ranks */
-  int nranks = 0, c = 0;
  do {
-    NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
-    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
-    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
+    int tmpFd;
+    NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd), res, out);
+    NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &info, sizeof(info)), res, out);
+    close(tmpFd);

    if (c == 0) {
      nranks = info.nranks;
-      NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
-      NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out);
    }

    if (nranks != info.nranks) {
@@ -191,14 +126,14 @@ static void *bootstrapRoot(void* listenComm) {
      goto out;
    }

-    if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+    if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
      goto out;
    }

    // Save the connection handle for that rank
-    memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
-    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+    memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
+    memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));

    ++c;
    TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d",  info.rank, c, nranks);
@@ -208,44 +143,46 @@ static void *bootstrapRoot(void* listenComm) {
  // Send the connect handle for the next rank in the AllGather ring
  for (int r=0; r<nranks; ++r) {
    int next = (r+1) % nranks;
-    void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
-    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
-    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
+    int tmpSendFd;
+    NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
+    NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddresses+next, sizeof(union socketAddress)), res, out);
+    close(tmpSendFd);
  }
  TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);

 out:
-  bootstrapNetCloseListen(listenComm);
-  if (rankHandles) free(rankHandles);
-  if (rankHandlesRoot) free(rankHandlesRoot);
+  close(listenFd);
+  if (rankAddresses) free(rankAddresses);
+  if (rankAddressesRoot) free(rankAddressesRoot);
+  if (zero) free(zero);

  TRACE(NCCL_INIT, "DONE");
  return NULL;
 }

 ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
-  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
-  void* listenComm;
-  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
+  union socketAddress* connectAddr = (union socketAddress*) id;
+  int listenFd;
+  NCCLCHECK(createListenSocket(&listenFd, connectAddr));
  pthread_t thread;
-  pthread_create(&thread, NULL, bootstrapRoot, listenComm);
+  pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd);
  return ncclSuccess;
 }

 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
-  static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
  memset(id, 0, sizeof(ncclUniqueId));
-  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+  union socketAddress* connectAddr = (union socketAddress*) id;

  char* env = getenv("NCCL_COMM_ID");
  if (env) {
    INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
-    if (bootstrapNetCreateHandle(netHandle, env) != 0) {
+    if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
      return ncclInvalidArgument;
    }
  } else {
+    memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
    NCCLCHECK(bootstrapCreateRoot(id, false));
  }

@@ -254,24 +191,135 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {

 struct unexConn {
  int peer;
-  void* comm;
+  int fd;
  struct unexConn* next;
 };

-struct extState {
-  void* extBstrapListenComm;
-  void* extBstrapRingRecvComm;
-  void* extBstrapRingSendComm;
-  ncclNetHandle_t* peerBstrapHandles;
-  struct unexConn* unexpectedConnections;
-  int rank;
-  int nranks;
-  int dev;
+// Remote allocator state
+struct remAllocState {
+  int cudaDev;
+  int listenFd;
+  int stop;
 };

+struct extState {
+  int extListenFd;
+  int extRingRecvFd;
+  int extRingSendFd;
+  union socketAddress* peerCommAddresses;
+  union socketAddress* peerAllocAddresses;
+  struct unexConn* unexpectedConnections;
+  int cudaDev;
+  int rank;
+  int nranks;
+
+  // Intermediate memory allocation service
+  struct remAllocState* allocState;
+  pthread_t allocThread;
+};
+
+#define MAX_SEGMENTS 128
+
+static ncclResult_t remoteAlloc(void** ptr, int fd) {
+  size_t size;
+  NCCLCHECK(socketRecv(fd, &size, sizeof(size_t)));
+  hipIpcMemHandle_t devIpc;
+  NCCLCHECK(ncclCudaCalloc((char**)ptr, size, true));
+  hipError_t res = hipIpcGetMemHandle(&devIpc, *ptr);
+  if (res != hipSuccess) {
+    WARN("[Rem Allocator] hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
+    hipFree(*ptr);
+    CUDACHECK(res);
+  }
+  // The CUDA IPC
+  NCCLCHECK(socketSend(fd, &devIpc, sizeof(hipIpcMemHandle_t)));
+  // And the direct pointer
+  NCCLCHECK(socketSend(fd, ptr, sizeof(void*)));
+  return ncclSuccess;
+}
+
+#include <poll.h>
+
+// Service thread to allocate memory for other GPUs, used as intermediate step.
+void* ncclRemoteMemAllocationService(void* args) {
+  struct remAllocState* state = (struct remAllocState *) args;
+  if (hipSetDevice(state->cudaDev) != hipSuccess) {
+    WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
+  }
+
+  // Prepare poll descriptor
+  void* segments[MAX_SEGMENTS];
+  struct pollfd pollfds[MAX_SEGMENTS+1];
+  for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
+  for (int s=0; s<MAX_SEGMENTS; s++) {
+    pollfds[s].fd = -1;
+    pollfds[s].events = POLLHUP;
+  }
+  pollfds[MAX_SEGMENTS].fd = state->listenFd;
+  pollfds[MAX_SEGMENTS].events = POLLIN;
+
+  int nbuffers = 0;
+  while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
+    if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
+      WARN("[Rem Allocator] Poll failed with error %d", error);
+      return NULL;
+    }
+    if (pollfds[MAX_SEGMENTS].revents) {
+      int s = 0;
+      while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
+      if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd) != ncclSuccess) {
+        pollfds[s].fd = -1;
+      } else {
+        if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd) != ncclSuccess)) {
+          WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
+          close(pollfds[s].fd);
+          pollfds[s].fd = -1;
+        } else {
+          nbuffers++;
+        }
+      }
+    }
+    for (int s=0; s<MAX_SEGMENTS; s++) {
+      if (pollfds[s].revents & POLLHUP) {
+        if (hipFree(segments[s]) != hipSuccess) {
+          WARN("[Rem Allocator] hipFree %p failed", segments[s]);
+        }
+        segments[s] = NULL;
+        close(pollfds[s].fd);
+        pollfds[s].fd = -1;
+        nbuffers--;
+      }
+    }
+  }
+  for (int s=0; s<MAX_SEGMENTS; s++) {
+    if (segments[s]) hipFree(segments[s]);
+    close(pollfds[s].fd);
+  }
+  close(state->listenFd);
+  free(state);
+  return NULL;
+}
+
+ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr) {
+  struct extState* state = (struct extState*)commState;
+  int fd;
+  ncclResult_t res;
+  *id = -1;
+  NCCLCHECK(connectAddress(&fd, state->peerAllocAddresses+rank));
+  NCCLCHECKGOTO(socketSend(fd, &size, sizeof(size_t)), res, end);
+  NCCLCHECKGOTO(socketRecv(fd, ipc, sizeof(hipIpcMemHandle_t)), res, end);
+  NCCLCHECKGOTO(socketRecv(fd, ptr, sizeof(void*)), res, end);
+  *id = fd;
+end:
+  return res;
+}
+
+ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
+  SYSCHECK(close(id), "close");
+  return ncclSuccess;
+}
+
 ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
-  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
-  bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
  struct extState* state;
  NCCLCHECK(ncclCalloc(&state, 1));
  state->rank = rank;
@@ -283,19 +331,15 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
  struct extInfo info = { 0 };
  info.rank = rank;
  info.nranks = nranks;
-  void *tmpSendComm, *tmpRecvComm;
-  // Pass the remote address to listen via info
-  if (idFromEnv) {
-    memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
-  }
-  // listen will return the local address via info (specify interface type 'findSubnetIf')
-  state->dev = idFromEnv ? findSubnetIf : 0;
-  void* extBstrapListenCommRoot;
-  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
-  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+  int tmpSendFd, tmpRecvFd;

-  // stagger connection times to avoid an overload of the root at very high rank counts
+  int extListenFdRoot;
+  memcpy(&info.extAddressListen,     &bootstrapNetIfAddr, sizeof(union socketAddress));
+  memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
+  NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
+  NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
+
+  // stagger connection times to avoid an overload of the root
  if (nranks > 128) {
    long msec = rank;
    struct timespec tv;
@@ -306,25 +350,35 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
  }

  // send info on my listening socket to root
-  NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
-  NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
-  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  union socketAddress* rootAddr = (union socketAddress*)id;
+  NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
+  NCCLCHECK(bootstrapNetSend(tmpSendFd, &info, sizeof(info)));
+  close(tmpSendFd);

  // get info on my "next" rank in the bootstrap ring from root
-  ncclNetHandle_t extHandleNext;
-  NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
-  NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
-  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
-  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
+  union socketAddress extAddressNext;
+  NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd));
+  NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &extAddressNext, sizeof(extAddressNext)));
+  close(tmpRecvFd);
+  close(extListenFdRoot);

-  NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
+  NCCLCHECK(connectAddress(&state->extRingSendFd, &extAddressNext));
  // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+  NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd));

  // AllGather all listen handlers
-  NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
-  memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
-  NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+  NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
+  memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
+  NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
+
+  // Create the memory allocation service
+  NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
+  memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
+  NCCLCHECK(ncclCalloc(&state->allocState, 1));
+  CUDACHECK(hipGetDevice(&state->allocState->cudaDev));
+  NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
+  pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
+  NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));

  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

@@ -348,9 +402,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
    size_t sslice = (rank - i + nranks) % nranks;

    // Send slice to the right
-    NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
+    NCCLCHECK(bootstrapNetSend(state->extRingSendFd, data+sslice*size, size));
    // Recv slice from the left
-    NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+    NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, data+rslice*size, size));
  }

  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
@@ -359,20 +413,20 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {

 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
  struct extState* state = (struct extState*)commState;
-  void* tmpSendComm;
-  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
-  NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
-  NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
-  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  int tmpSendFd;
+  NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer));
+  NCCLCHECK(bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int)));
+  NCCLCHECK(bootstrapNetSend(tmpSendFd, data, size));
+  close(tmpSendFd);
  return ncclSuccess;
 }

-ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
  // New unex
  struct unexConn* unex;
  NCCLCHECK(ncclCalloc(&unex, 1));
  unex->peer = peer;
-  unex->comm = comm;
+  unex->fd = fd;

  // Enqueue
  struct unexConn* list = state->unexpectedConnections;
@@ -385,7 +439,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
  return ncclSuccess;
 }

-void* unexpectedDequeue(struct extState* state, int peer) {
+int unexpectedDequeue(struct extState* state, int peer) {
  struct unexConn* elem = state->unexpectedConnections;
  struct unexConn* prev = NULL;
  while (elem) {
@@ -395,41 +449,41 @@ void* unexpectedDequeue(struct extState* state, int peer) {
      } else {
        prev->next = elem->next;
      }
-      void* comm = elem->comm;
+      int fd = elem->fd;
      free(elem);
-      return comm;
+      return fd;
    }
    prev = elem;
    elem = elem->next;
  }
-  return NULL;
+  return -1;
 }

 // We can't know who we'll receive from, so we need to receive everything at once
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
  struct extState* state = (struct extState*)commState;

-  void* tmpRecvComm;
+  int tmpRecvFd;

  // Search unexpected connections first
-  if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
-    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
-    NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+  if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) {
+    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
+    close(tmpRecvFd);
    return ncclSuccess;
  }

  // Then look for new connections
  while (1) {
-    NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+    NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd));
    int newPeer;
-    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int)));
    if (newPeer == peer) {
-      NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
-      NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+      NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
+      close(tmpRecvFd);
      return ncclSuccess;
    }
    // Unexpected connection. Save for later.
-    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd));
  }
 }

@@ -439,11 +493,17 @@ ncclResult_t bootstrapClose(void* commState) {
    WARN("Unexpected connections are not empty.\n");
    return ncclInternalError;
  }
-  NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
-  NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
-  NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+  close(state->extListenFd);
+  close(state->extRingSendFd);
+  close(state->extRingRecvFd);

-  free(state->peerBstrapHandles);
+  state->allocState->stop = 1;
+
+  // Join the allocThread so we catch resource leaks as being hung here
+  // pthread_join(state->allocThread, nullptr);
+
+  free(state->peerCommAddresses);
+  free(state->peerAllocAddresses);
  free(state);

  return ncclSuccess;
@@ -451,10 +511,12 @@ ncclResult_t bootstrapClose(void* commState) {

 ncclResult_t bootstrapAbort(void* commState) {
  struct extState* state = (struct extState*)commState;
-  bootstrapNetCloseListen(state->extBstrapListenComm);
-  bootstrapNetCloseSend(state->extBstrapRingSendComm);
-  bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
-  free(state->peerBstrapHandles);
+  close(state->extListenFd);
+  close(state->extRingSendFd);
+  close(state->extRingRecvFd);
+  state->allocState->stop = 2;
+  free(state->peerCommAddresses);
+  free(state->peerAllocAddresses);
  free(state);
  return ncclSuccess;
 }
@@ -26,16 +26,14 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  }

  // Per-channel operation list.
-  NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
-  NCCLCHECK(ncclCudaHostCalloc(&channel->collectivesExtra, comm->nRanks*NCCL_MAX_OPS*4));
+  NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
  return ncclSuccess;
 }

 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
  if (channel->id == -1) return ncclSuccess;
  // Operation list
-  NCCLCHECK(ncclCudaHostFree(channel->collectivesExtra));
-  NCCLCHECK(ncclCudaHostFree(channel->collectives));
+  NCCLCHECK(ncclCudaHostFree(channel->workFifo));

  // Free Ring index to rank tables
  free(channel->ring.userRanks);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
-  struct ncclInfo info = { ncclCollAllGather, "AllGather",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncAllGather, "AllGather",
    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
  return ncclEnqueueCheck(&info);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
-  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
  return ncclEnqueueCheck(&info);
@@ -25,9 +25,10 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
    NCCLCHECK(ncclGroupEnd());
    return ncclSuccess;
  } else {
-    struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
-      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
-      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
-    return ncclEnqueueCheck(&info);
+    //struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
+    //  sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
+    //  ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
+    //return ncclEnqueueCheck(&info);
+    return ncclInternalError;
  }
 }
@@ -37,9 +37,10 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
    NCCLCHECK(ncclGroupEnd());
    return ncclSuccess;
  } else {
-    struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
-      sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
-      ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
-    return ncclEnqueueCheck(&info);
+    //struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
+    //  sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
+    //  ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
+    //return ncclEnqueueCheck(&info);
+    return ncclInternalError;
  }
 }
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
-  struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
  return ncclEnqueueCheck(&info);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"

-IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
+IMPL_COLL_C(AllGather);
@@ -9,206 +9,201 @@
 #include "primitives.h"
 #include "collectives.h"

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*realChunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t chunkOffset = gridOffset + bid*realChunkSize;

-    /////////////// begin AllGather steps ///////////////
-    ssize_t offset;
-    int nelem = min(realChunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin AllGather steps ///////////////
+        ssize_t offset;
+        int nelem = min(realChunkSize, size-chunkOffset);
+        int rankDest;

-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;

-    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      prims.directSend(thisInput+chunkOffset, offset, nelem);
-    } else {
-      prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
+        if (thisInput + chunkOffset == thisOutput + offset) { // In place
+          prims.directSend(thisInput+chunkOffset, offset, nelem);
+        } else {
+          prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j=1; j<nranks-1; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+        }
+
+        // Make final copy from buffer to dest.
+        rankDest = ring->devUserRanks[1];
+        offset = chunkOffset + rankDest * size;
+
+        // Final wait/copy.
+        prims.directRecv(thisOutput+offset, offset, nelem);
+      }
    }
+};

-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;

-      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+        /////////////// begin AllGather steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;
+
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        if (thisInput + chunkOffset == thisOutput + offset) { // In place
+          LLprims.send(thisInput+chunkOffset, nelem);
+        } else {
+          LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j=1; j<nranks-1; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          LLprims.recvCopySend(thisOutput+offset, nelem);
+        }
+
+        // step k-1: final store
+        rankDest = ring->devUserRanks[1];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recv(thisOutput+offset, nelem);
+      }
    }
-
-    // Make final copy from buffer to dest.
-    rankDest = ring->devUserRanks[1];
-    offset = chunkOffset + rankDest * size;
-
-    // Final wait/copy.
-    prims.directRecv(thisOutput+offset, offset, nelem);
-  }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
-
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
-    }
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
-    /////////////// begin AllGather steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
-
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
-
-    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LLprims.send(thisInput+chunkOffset, nelem);
-    } else {
-      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
-    }
-
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
-
-      LLprims.recvCopySend(thisOutput+offset, nelem);
-    }
-
-    // step k-1: final store
-    rankDest = ring->devUserRanks[1];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.recv(thisOutput+offset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
+};

 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;

-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);

-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;

-    /////////////// begin AllGather steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin AllGather steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;

-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;

-    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LLprims.send(thisInput+chunkOffset, nelem);
-    } else {
-      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+        if (thisInput + chunkOffset == thisOutput + offset) { // In place
+          LLprims.send(thisInput+chunkOffset, nelem);
+        } else {
+          LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j=1; j<nranks-1; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          LLprims.recvCopySend(thisOutput+offset, nelem);
+        }
+
+        // step k-1: final store
+        rankDest = ring->devUserRanks[1];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recv(thisOutput+offset, nelem);
+      }
    }
+};

-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+template<int PROTO, class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_TREE, PROTO, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};

-      LLprims.recvCopySend(thisOutput+offset, nelem);
-    }
+template<int PROTO, class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};

-    // step k-1: final store
-    rankDest = ring->devUserRanks[1];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.recv(thisOutput+offset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"

-IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
+IMPL_COLL_R(AllReduce);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"

-IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
+IMPL_COLL_C(Broadcast);
@@ -9,177 +9,155 @@
 #include "primitives.h"
 #include "collectives.h"

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->coll.root;
-#ifdef ENABLE_PROFILING
-  auto devProf = comm->devProf;
-  uint64_t clk, t0 = 0ULL, ws;
-  if (tid == 0) clk = __rtc64();
-#endif
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int nextRank = ring->devUserRanks[1];
+      const int root = args->coll.root;

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*realChunkSize;
-    int nelem = min(realChunkSize, size-offset);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t offset = gridOffset + bid*realChunkSize;
+        int nelem = min(realChunkSize, size-offset);

-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        INIT_COUNTER;
-        prims.send(thisInput+offset, nelem);
-        ACCUMULATE_COUNTER(send);
-      } else {
-        INIT_COUNTER;
-        prims.copySend(thisInput+offset, thisOutput+offset, nelem);
-        ACCUMULATE_COUNTER(copySend);
+        if (rank == root) {
+          if (thisInput == thisOutput) {
+            prims.send(thisInput+offset, nelem);
+          } else {
+            prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+          }
+        } else if (nextRank == root) {
+          prims.recv(thisOutput+offset, nelem);
+        } else {
+          prims.recvCopySend(thisOutput+offset, nelem);
+        }
      }
-    } else if (nextRank == root) {
-      INIT_COUNTER;
-      prims.recv(thisOutput+offset, nelem);
-      ACCUMULATE_COUNTER(recv);
-    } else {
-      INIT_COUNTER;
-      prims.recvCopySend(thisOutput+offset, nelem);
-      ACCUMULATE_COUNTER(recvCopySend);
    }
-  }
-#ifdef ENABLE_PROFILING
-  if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST);
-#endif
-}
+};

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int nextRank = ring->devUserRanks[1];
+      const int root = args->coll.root;

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);

-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->coll.root;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t offset = gridOffset + bid*chunkSize;

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
-    }
-    ssize_t offset = gridOffset + bid*chunkSize;
-
-    int nelem = min(chunkSize, size-offset);
-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        LLprims.send(thisInput+offset, nelem);
-      } else {
-        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+        int nelem = min(chunkSize, size-offset);
+        if (rank == root) {
+          if (thisInput == thisOutput) {
+            LLprims.send(thisInput+offset, nelem);
+          } else {
+            LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+          }
+        } else if (nextRank == root) {
+          LLprims.recv(thisOutput + offset, nelem);
+        } else {
+          LLprims.recvCopySend(thisOutput + offset, nelem);
+        }
      }
-    } else if (nextRank == root) {
-      LLprims.recv(thisOutput + offset, nelem);
-    } else {
-      LLprims.recvCopySend(thisOutput + offset, nelem);
    }
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
+};

 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->coll.root;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int nextRank = ring->devUserRanks[1];
+      const int root = args->coll.root;

-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
-    ssize_t offset = gridOffset + bid*chunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+        ssize_t offset = gridOffset + bid*chunkSize;

-    int nelem = min(chunkSize, size-offset);
-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        LLprims.send(thisInput+offset, nelem);
-      } else {
-        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+        int nelem = min(chunkSize, size-offset);
+        if (rank == root) {
+          if (thisInput == thisOutput) {
+            LLprims.send(thisInput+offset, nelem);
+          } else {
+            LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+          }
+        } else if (nextRank == root) {
+          LLprims.recv(thisOutput + offset, nelem);
+        } else {
+          LLprims.recvCopySend(thisOutput + offset, nelem);
+        }
      }
-    } else if (nextRank == root) {
-      LLprims.recv(thisOutput + offset, nelem);
-    } else {
-      LLprims.recvCopySend(thisOutput + offset, nelem);
    }
-  }
-}
+};

-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};

-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};
@@ -11,122 +11,95 @@
 #include "collectives.h"
 #include "devcomm.h"

-__device__
-inline  __attribute((always_inline))
-long long int __rtc64() {
-#if __HIP__
-  return (long long int) __builtin_amdgcn_s_memrealtime();
-#else
-  return (long long int) __clock_u64();
-#endif
-}
+#define COLL_UNROLL 2
+#define NCCL_MAX_DEV_ARITY NCCL_MAX_TREE_ARITY

 // Exit If Abort Barrier across CTA: make sure all threads exit consistently
 // Each thread sets a predicate to true if abort == 1
 // all CTA's threads enter the barrier and do a popc on their predicates being True
 // If any of the thread's predicate was True, all the threads call exit()
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #define exitIfAbortBarrier(abort, abortCount) \
  if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
  __syncthreads(); \
  if (LOAD(abortCount)) { /*asm volatile ("s_endpgm");*/ return false; }
 #define __syncwarp()
-#else
-static inline __device__ void exitIfAbortBarrier(int abort) {
-  uint32_t popc;
-  asm ("{");
-  asm volatile ("   .reg .pred barr_pred;");
-  asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
-  asm volatile ("   bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
-  asm ("}");
-  if (popc) { asm volatile ("exit;"); }
-}
-#endif

-#define NCCL_FUNC5(coll, op, dtype) \
-  NCCL_COLL_NAME(coll##LL, op, dtype), \
-  NCCL_COLL_NAME(coll##LL, op, dtype), \
-  NCCL_COLL_NAME(coll, op, dtype)
+#define NCCL_FUNC5(func, algo, redop, type) \
+  NCCL_FUNC_NAME(func, algo, LL,     redop, type), \
+  NCCL_FUNC_NAME(func, algo, LL,     redop, type), \
+  NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)

-#define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_FUNC5(coll##Tree, op, dtype), \
-  NCCL_FUNC5(coll##Ring, op, dtype), \
-  NCCL_FUNC5(coll##CollNet, op, dtype)
+#define NCCL_FUNC4(func, redop, type) \
+  NCCL_FUNC5(func, TREE,    redop, type), \
+  NCCL_FUNC5(func, RING,    redop, type), \
+  NCCL_FUNC5(func, COLLNET, redop, type)

 // Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  u8), \
-  NCCL_FUNC4(coll, op, i32), \
-  NCCL_FUNC4(coll, op, u32), \
-  NCCL_FUNC4(coll, op, i64), \
-  NCCL_FUNC4(coll, op, u64), \
-  NCCL_FUNC4(coll, op, f16), \
-  NCCL_FUNC4(coll, op, f32), \
-  NCCL_FUNC4(coll, op, f64), \
-  NCCL_FUNC4(coll, op, b16)
-#define NCCL_FUNCS3B(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8)
+#define NCCL_FUNCS3A(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, uint8_t), \
+  NCCL_FUNC4(func, redop, int32_t), \
+  NCCL_FUNC4(func, redop, uint32_t), \
+  NCCL_FUNC4(func, redop, int64_t), \
+  NCCL_FUNC4(func, redop, uint64_t), \
+  NCCL_FUNC4(func, redop, half), \
+  NCCL_FUNC4(func, redop, float), \
+  NCCL_FUNC4(func, redop, double), \
+  NCCL_FUNC4(func, redop, rccl_bfloat16)
+#define NCCL_FUNCS3B(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t)

 // Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum ), \
-  NCCL_FUNCS3A(coll, prod), \
-  NCCL_FUNCS3A(coll, max ), \
-  NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
+#define NCCL_FUNCS2A(func) \
+  NCCL_FUNCS3A(func, Sum ), \
+  NCCL_FUNCS3A(func, Prod), \
+  NCCL_FUNCS3A(func, Max ), \
+  NCCL_FUNCS3A(func, Min )
+#define NCCL_FUNCS2B(func) \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum)

 // Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
-  NCCL_FUNCS2B(ncclBroadcast), \
-  NCCL_FUNCS2A(ncclReduce), \
-  NCCL_FUNCS2B(ncclAllGather), \
-  NCCL_FUNCS2A(ncclReduceScatter), \
-  NCCL_FUNCS2A(ncclAllReduce), \
-  NCCL_COLL_NAME(ncclGather, copy, i8), \
-  NCCL_COLL_NAME(ncclScatter, copy, i8), \
-  NCCL_COLL_NAME(ncclAllToAll, copy, i8), \
-  NCCL_COLL_NAME(ncclAllToAllv, copy, i8), \
-  NCCL_COLL_NAME(ncclSendRecv, copy, i8) }
+  NCCL_FUNCS2B(Broadcast), \
+  NCCL_FUNCS2A(Reduce), \
+  NCCL_FUNCS2B(AllGather), \
+  NCCL_FUNCS2A(ReduceScatter), \
+  NCCL_FUNCS2A(AllReduce), \
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t) }

 // Must be consistent with the ncclFuncSet enum
-using ncclKernelFunc_t = void (*)(struct CollectiveArgs*);
+using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);

 static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
 #if defined(__HIP_DEVICE_COMPILE__)
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce),
-  NCCL_COLL_NAME(ncclGather, copy, i8),
-  NCCL_COLL_NAME(ncclScatter, copy, i8),
-  NCCL_COLL_NAME(ncclAllToAll, copy, i8),
-  NCCL_COLL_NAME(ncclAllToAllv, copy, i8),
-  NCCL_COLL_NAME(ncclSendRecv, copy, i8)
+  NCCL_FUNCS2B(Broadcast),
+  NCCL_FUNCS2A(Reduce),
+  NCCL_FUNCS2B(AllGather),
+  NCCL_FUNCS2A(ReduceScatter),
+  NCCL_FUNCS2A(AllReduce),
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
 #endif
 };

 template<unsigned short f, unsigned short l>
 struct Caller {
  static __device__ __host__
-  void call(ncclColl* const c) noexcept
+  void call(struct ncclWorkElem* const c) noexcept
  {
    constexpr unsigned short m = f + (l - f) / 2;

@@ -137,78 +110,72 @@ struct Caller {
 template<unsigned short f>
 struct Caller<f, f + 1>{
  static __device__ __host__
-  void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
+  void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); }
 };

 inline
 __device__
-void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
+void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
  if (c->funcIndex < 360) {
-    if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
-    else ncclBroadcastCollNet_copy_i8(&c->args);
+    if (c->funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
+    else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(c);
  }
  else if (c->funcIndex < 720) Caller<360, 720>::call(c);
  else if (c->funcIndex < 1080) {
-    if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
-    else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
-    else ncclAllGatherCollNet_copy_i8(&c->args);
+    if (c->funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
+    else if (c->funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
+    else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
  }
  else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
-  else if (c->funcIndex == 1800) {
-    ncclGather_copy_i8(&c->args);
-  }
-  else if (c->funcIndex == 1801) {
-    ncclScatter_copy_i8(&c->args);
-  }
-  else if (c->funcIndex == 1802) {
-    ncclAllToAll_copy_i8(&c->args);
-  }
-  else if (c->funcIndex == 1803) {
-    ncclAllToAllv_copy_i8(&c->args);
-  }
-  else ncclSendRecv_copy_i8(&c->args);
+  else ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
 }

-static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
+static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
  int* d = (int*)dst;
  int* s = (int*)src;
  for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
 }

-static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
-  // Check whether the last operation was aborted and make sure all threads exit
-  int abort = tid == 0 ? *(comm->abortFlag) : 0;
-  exitIfAbortBarrier(abort, abortCount);
-  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
+static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
  __syncthreads();
-  if (tid == 0) hostColl->active = 0;
+  load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
+  // Check whether the last operation was aborted and make sure all threads exit
+  int abort = tid == 0 ? LOAD(comm->abortFlag) : 0;
+  exitIfAbortBarrier(abort, abortCount);
+  if (tid == 0) hostWork->elems[0].active = 0;
  return true;
 }

+template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {}
+};
+
 #ifdef ENABLE_COLLTRACE
 #define traceColl(fIdx)  \
    uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    comm->collTrace[pos].timeStamp = __rtc64(); \
-    comm->collTrace[pos].opCount = localColl.args.opCount; \
+    comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    comm->collTrace[pos].opCount = w->opCount; \
    comm->collTrace[pos].bid = bid; \
    comm->collTrace[pos].funcIndex = fIdx;
 #define traceKernelLaunch(fIdx)  { \
    traceColl(fIdx); \
-    comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (comm->collTrace[pos].data_0)); \
+    comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
  }
 #define traceCollEnd(fIdx)  { \
    traceColl(fIdx); \
@@ -218,124 +185,159 @@ static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* ho
    traceColl(fIdx); \
    comm->collTrace[pos].type = ncclCollTraceAbortType; \
  }
+//  traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
+#define traceData(data2, data4, data8_0, data8_1) { \
+    uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    comm->collTrace[pos].bid = blockIdx.x; \
+    comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    comm->collTrace[pos].funcIndex = data2; \
+    comm->collTrace[pos].data_0 = data4; \
+    comm->collTrace[pos].opCount = data8_0; \
+    comm->collTrace[pos].data_1 = data8_1; \
+    comm->collTrace[pos].type = ncclCollTraceDataType; \
+  }
 #else
-#define traceKernelLaunch()
-#define traceCollEnd()
-#define traceAbort()
+#define traceKernelLaunch(fIdx)
+#define traceCollEnd(fIdx)
+#define traceAbort(fIdx)
+#define traceData(data2, data4, data8_0, data8_1)
 #endif

-extern __device__ volatile uint64_t* ncclShmem;
+#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)

+struct ncclShmemPtrs {
+  void* srcs[NCCL_MAX_DEV_ARITY+1];
+  void* dsts[NCCL_MAX_DEV_ARITY+1];
+  uint64_t barrier;
+  uint64_t barrier_next[MAXWARPS];
+};
+
+struct ncclShmemData {
+  union {
 #ifdef ENABLE_LL128
-#define ALLOCATE_SHMEM \
-  __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
-  ncclShmem = shmem; \
-  __shared__ uint32_t sync[NCCL_LL128_MAX_NTHREADS/WARP_SIZE];
+    volatile uint64_t data[NCCL_LL128_SHMEM_SIZE];
 #else
-#define ALLOCATE_SHMEM \
-  uint32_t* sync = 0;
+    volatile uint64_t* data;
 #endif
+    struct ncclShmemPtrs ptrs[NCCL_MAX_GROUPS];
+  };
+  uint32_t sync[MAXWARPS];
+  struct ncclWork localWork;
+};

-/* Functions for aggregation case */
-#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
-__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
-  coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
+extern __device__ struct ncclShmemData *ncclShmem;
+template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL, int FINDEX, bool COLLTRACE>
+__device__ void ncclKernel(struct ncclWorkElem first)  {
+  int tid = threadIdx.x;
+  int bid = blockIdx.x;
+  __shared__ struct ncclShmemData shmem;
+  ncclShmem = &shmem;
+  __shared__ uint32_t abortCount;
+  if (tid == 0) {
+    abortCount = 0;
+    for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
+      shmem.ptrs[i].barrier = 0;
+      for (auto j = 0; j < MAXWARPS; j++) shmem.ptrs[i].barrier_next[j] = 0;
+    }
+  }
+  __syncthreads();
+
+  auto f = ncclFunction<FUNCTION, ALGO, PROTO, REDOP, T, UNROLL>();
+
+  struct ncclDevComm* comm = first.comm;
+  struct ncclChannel* channel = comm->channels+bid;
+  struct ncclWorkElem* w = NULL;
+  uint16_t index = first.index;
+  bool firstLaunch = true;
+
+  if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
+
+  while (1) {
+    if (w == NULL) {
+      w = shmem.localWork.elems;
+      if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) {
+        if (COLLTRACE && tid == 0) traceAbort(-1);
+        return;
+      }
+      if (COLLTRACE && tid == 0) {
+        if (firstLaunch) traceKernelLaunch(w->funcIndex);
+        if (!firstLaunch) traceCollEnd(w->funcIndex);
+        firstLaunch = false;
+      }
+    } else {
+      if (COLLTRACE && tid == 0) {
+        traceKernelLaunch(w->funcIndex);
+        firstLaunch = false;
+      }
+    }
+    if (tid < w->nThreads) {
+      if (w->funcIndex == FINDEX) {
+        f.run(w);
+      } else {
+        NCCL_CALL_FUNCTIONS(w);
+      }
+    }
+    index = (index+1) % NCCL_MAX_OPS;
+    if (w->active == 2) {
+      if (COLLTRACE && tid == 0) traceCollEnd(-1);
+      return;
+    }
+    w = NULL;
+  }
 }

-/* Kernels with the first operation inlined */
-#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
+#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) \
-__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
-  int tid = threadIdx.x; \
-  int bid = blockIdx.x; \
-  ALLOCATE_SHMEM; \
-  __shared__ struct ncclColl localColl; \
-  __shared__ uint32_t abortCount; \
-  __shared__ uint64_t barrier[MAXBARRIERS]; \
-  __shared__ uint64_t barrier_next[MAXBARRIERS*MAXWARPS]; \
-  if (tid == 0) abortCount = 0; \
-  __syncthreads(); \
- \
-  struct ncclChannel* channel = comm->channels+bid; \
-  if (tid == 0) { \
-    channel->sync = sync; \
-    channel->barrier = barrier; \
-    channel->barrier_next = barrier_next; \
-    for (auto i = 0; i < MAXBARRIERS; i++) barrier[i] = 0; \
-    for (auto i = 0; i < MAXBARRIERS*MAXWARPS; i++) barrier_next[i] = 0; \
-  } \
-  if (!load_coll(&localColl, channel->collectives+channel->collFifoHead, tid, comm, &abortCount)) { \
-    if (tid == 0) traceAbort(-1); \
-    return; \
-  } \
-  if (tid == 0) traceKernelLaunch(localColl.funcIndex); \
-  while (1) { \
-    if (tid < localColl.args.common.nThreads) { \
-      if (localColl.funcIndex == fIndex) { \
-        coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&localColl.args); \
-      } else { \
-        NCCL_CALL_FUNCTIONS(&localColl); \
-      } \
-    } \
-    int nextIndex = localColl.nextIndex; \
-    if (tid == 0) channel->collFifoHead = nextIndex; \
- \
-    if (localColl.active == 2) { \
-      if (tid == 0) traceCollEnd(-1); \
-      return; \
-    } \
- \
-    /* Load next collective operation*/ \
-    if (!load_coll(&localColl, channel->collectives+nextIndex, tid, comm, &abortCount)) { \
-      if (tid == 0) traceAbort(-1); \
-      break; \
-    } \
-    if (tid == 0) traceCollEnd(localColl.funcIndex); \
-  } \
+__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first) { \
+  if (first.comm->collTraceThread) \
+    ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, true>(first); \
+  else \
+    ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, false>(first); \
 }

-#define IMPL_COLL_KERN_sum(coll, op, ncclFunc, dtype, ctype, fIndex) \
-  IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
-#define IMPL_COLL_KERN_copy(coll, op, ncclFunc, dtype, ctype, fIndex) \
-  IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
-#define IMPL_COLL_KERN_prod(coll, op, ncclFunc, dtype, ctype, fIndex)
-#define IMPL_COLL_KERN_min(coll, op, ncclFunc, dtype, ctype, fIndex)
-#define IMPL_COLL_KERN_max(coll, op, ncclFunc, dtype, ctype, fIndex)
+// Examples :     AllReduce, RING, LL,    Sum,   uint8
+/* Functions for aggregation case */
+#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \
+__device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args) { \
+  auto f = ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL>(); \
+  f.run(args); \
+}

 // Only generate inline kernels for LL
-#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
-  IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
+#define IMPL_COLL4(func, algo, redop, type, ncclType) \
+  IMPL_COLL_FUNC(func, algo, LL,     redop, type) \
+  IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type) \

-#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
-  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
-  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
-  IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
+#define IMPL_COLL3(func, redop, type, ncclType) \
+  IMPL_COLL4(func, TREE,    redop, type, ncclType) \
+  IMPL_COLL4(func, RING,    redop, type, ncclType) \
+  IMPL_COLL4(func, COLLNET, redop, type, ncclType)

-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8) \
-  IMPL_COLL3(coll, op, ncclFunc, u8,  uint8_t,  ncclColl, ncclOp, ncclUint8) \
-  IMPL_COLL3(coll, op, ncclFunc, i32, int32_t,  ncclColl, ncclOp, ncclInt32) \
-  IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
-  IMPL_COLL3(coll, op, ncclFunc, i64, int64_t,  ncclColl, ncclOp, ncclInt64) \
-  IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
-  IMPL_COLL3(coll, op, ncclFunc, f16, half,     ncclColl, ncclOp, ncclFloat16) \
-  IMPL_COLL3(coll, op, ncclFunc, f32, float,    ncclColl, ncclOp, ncclFloat32) \
-  IMPL_COLL3(coll, op, ncclFunc, f64, double,   ncclColl, ncclOp, ncclFloat64) \
-  IMPL_COLL3(coll, op, ncclFunc, b16, rccl_bfloat16, ncclColl, ncclOp, ncclBfloat16)
+#define IMPL_COLL2(func, redop) \
+  IMPL_COLL3(func, redop, int8_t,   ncclInt8) \
+  IMPL_COLL3(func, redop, uint8_t,  ncclUint8) \
+  IMPL_COLL3(func, redop, int32_t,  ncclInt32) \
+  IMPL_COLL3(func, redop, uint32_t, ncclUint32) \
+  IMPL_COLL3(func, redop, int64_t,  ncclInt64) \
+  IMPL_COLL3(func, redop, uint64_t, ncclUint64) \
+  IMPL_COLL3(func, redop, half,     ncclFloat16) \
+  IMPL_COLL3(func, redop, float,    ncclFloat32) \
+  IMPL_COLL3(func, redop, double,   ncclFloat64) \
+  IMPL_COLL3(func, redop, rccl_bfloat16,   ncclBfloat16)

 // Reduction define all functions
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, sum,  FuncSum,  colln, ncclSum); \
-  IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); \
-  IMPL_COLL2(collf, min,  FuncMin,  colln, ncclMin); \
-  IMPL_COLL2(collf, max,  FuncMax,  colln, ncclMax);
+#define IMPL_COLL_R(func) \
+  IMPL_COLL2(func, Sum) \
+  IMPL_COLL2(func, Prod) \
+  IMPL_COLL2(func, Min) \
+  IMPL_COLL2(func, Max)

-// Copy primitives only define one
-#define IMPL_COLL_C(collf, colln) \
-  IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
+// Copy primitives only define one function for copy
+#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);

-#define COLL_UNROLL 2
+// Point-to-point primitives only have one function/kernel.
+#define IMPL_COLL_P(func) \
+  IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
+  IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);

 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -282,28 +282,57 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
 #endif
 }

-template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ void ReduceCopyMulti(const int tid, const int nthreads,
-    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
-    const int offset, const int N) {
-  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
-    T val = vFetch(srcs[0]+idx);
-    #pragma unroll
-    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
-    #pragma unroll 1
-    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
+    int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem) {
+  const int inc = nw * UNROLL * WARP_SIZE;
+  int offset = w * UNROLL * WARP_SIZE + t;
+
+  const T* srcs[MAXSRCS];
+  for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
+  T* dsts[MAXDSTS];
+  for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;
+
+  while (offset < Nelem) {
+    T vals[UNROLL];
+    // Load and reduce
+    for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);

    #pragma unroll
-    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
-    #pragma unroll 1
-    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+    for (int i=1; i<MINSRCS; i++) {
+      T vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
+    }
+    #pragma unroll
+    for (int i=MINSRCS; i<MAXSRCS; i++) {
+      if (i<nsrcs) {
+        T vals2[UNROLL];
+        for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
+        for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
+      }
+    }
+
+    // Store
+    #pragma unroll
+    for (int i = 0; i < MINDSTS; i++) {
+      for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    #pragma unroll
+    for (int i=MINDSTS; i<MAXDSTS; i++) {
+      if (i<ndsts) {
+        for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
+      }
+    }
+    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+    offset += inc;
  }
 }

 template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
-    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
-    const int elemOffset, const int Npack) {
+__device__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
+    int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Npack) {
  const int inc = nw * UNROLL * WARP_SIZE;
  int offset = w * UNROLL * WARP_SIZE + t;

@@ -334,8 +363,10 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
    }
    #pragma unroll 1
-    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
-      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    for (int i=MINDSTS; i<MAXDSTS; i++) {
+      if (i<ndsts) {
+        for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+      }
    }
    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
@@ -343,85 +374,73 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
  }
 }

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 template <typename T>
 __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
-#else
-__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
+
+#define PACKELEMS (sizeof(Pack128) / sizeof(T))
+
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+// Multiply UNROLL by 2 if single source/single destination
+#define AUTOUNROLL (UNROLL*((MINSRCS==1 && MINDSTS==1) ? 2 : 1))
 #endif

-// Try to limit consecutive load/stores to 8.
-// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
-#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
-
 template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
-    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T** srcs, int ndsts, T** dsts,
    int N) {
  int Nrem = N;
  if (Nrem <= 0) return;

-  int alignDiff = 0;
-  int align = ptrAlign128(srcs[0]);
-  #pragma unroll
-  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
-  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
-  #pragma unroll
-  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
-  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
-
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  int Npreamble = alignDiff ? Nrem :
-    N < alignof(int32_t) ? N :
-    (alignof(int32_t) - align) % alignof(int32_t);
-#else
-  int Npreamble = alignDiff ? Nrem :
-    N < alignof(Pack128) ? N :
-    (alignof(Pack128) - align) % alignof(Pack128);
-#endif
-
-  // stage 1: preamble: handle any elements up to the point of everything coming
-  // into alignment
-  if (Npreamble) {
-    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
-    Nrem -= Npreamble;
-    if (Nrem == 0) return;
-  }
-  int offset = Npreamble;
-
-  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
-  // assuming the pointers we have are all 128-bit alignable.
  int w = tid / WARP_SIZE;       // Warp number
  int nw = nthreads / WARP_SIZE; // Number of warps
  int t = tid % WARP_SIZE;       // Thread (inside the warp)

-  const int packFactor = sizeof(Pack128) / sizeof(T);
+  // Check that all is 16B aligned. If not don't use 16B load/stores.
+  int align = 0;
+  #pragma unroll
+  for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
+  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
+  #pragma unroll
+  for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
+  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);

-  // stage 2a: main loop
-  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
-      * (AUTOUNROLL * WARP_SIZE); // round down
-  int Nelem2a = Npack2a * packFactor;
+  int offset = 0;
+  if (align == 0) {
+    // fast path: use 128b loads/stores to do the bulk of the work,
+    // assuming the pointers we have are all 128-bit aligned.

-  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
+    // main loop
+    int Npack = (Nrem / (PACKELEMS*AUTOUNROLL*WARP_SIZE)) * (AUTOUNROLL*WARP_SIZE); // round down
+    int Nelem = Npack * PACKELEMS;

-  Nrem -= Nelem2a;
+    ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
+
+    Nrem -= Nelem;
+    if (Nrem == 0) return;
+    offset += Nelem;
+
+    // slightly less optimized for section when we don't have full unrolling
+    Npack = Nrem / PACKELEMS;
+    Nelem = Npack * PACKELEMS;
+
+    ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
+
+    Nrem -= Nelem;
+    if (Nrem == 0) return;
+    offset += Nelem;
+  }
+
+  // unrolled, by-type (mostly for unaligned buffers)
+  int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down
+
+  ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);
+
+  Nrem -= Nelem;
  if (Nrem == 0) return;
-  offset += Nelem2a;
+  offset += Nelem;

-  // stage 2b: slightly less optimized for section when we don't have full
-  // unrolling
-
-  int Npack2b = Nrem / packFactor;
-  int Nelem2b = Npack2b * packFactor;
-
-  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
-
-  Nrem -= Nelem2b;
-  if (Nrem == 0) return;
-  offset += Nelem2b;
-
-  // stage 2c: tail
-  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
+  // no unroll, by type. Should finish what's remaining.
+  ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
 }

 #endif // COMMON_KERNEL_H_
@@ -9,62 +9,62 @@
 #include "collectives.h"
 #include "common.h"

-__device__ volatile uint64_t* ncclShmem;
+__device__ struct ncclShmemData* ncclShmem;

 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
-NCCL_FUNC5(coll, op, dtype) \
-  NCCL_COLL_NAME(coll##LL, op, dtype), \
-  NCCL_COLL_NAME(coll##LL128, op, dtype), \
-  NCCL_COLL_NAME(coll, op, dtype)
+#define NCCL_FUNC5(func, algo, redop, type) \
+  NCCL_FUNC_NAME(func, algo, LL,     redop, type), \
+  NCCL_FUNC_NAME(func, algo, LL128,  redop, type), \
+  NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)

-#define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_FUNC5(coll##Tree, op, dtype), \
-  NCCL_FUNC5(coll##Ring, op, dtype), \
-  NCCL_FUNC5(coll##CollNet, op, dtype)
+#define NCCL_FUNC4(func, redop, type) \
+  NCCL_FUNC5(func, TREE,    redop, type), \
+  NCCL_FUNC5(func, RING,    redop, type), \
+  NCCL_FUNC5(func, COLLNET, redop, type)

 // Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  u8), \
-  NCCL_FUNC4(coll, op, i32), \
-  NCCL_FUNC4(coll, op, u32), \
-  NCCL_FUNC4(coll, op, i64), \
-  NCCL_FUNC4(coll, op, u64), \
-  NCCL_FUNC4(coll, op, f16), \
-  NCCL_FUNC4(coll, op, f32), \
-  NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8)
+#define NCCL_FUNCS3A(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, uint8_t), \
+  NCCL_FUNC4(func, redop, int32_t), \
+  NCCL_FUNC4(func, redop, uint32_t), \
+  NCCL_FUNC4(func, redop, int64_t), \
+  NCCL_FUNC4(func, redop, uint64_t), \
+  NCCL_FUNC4(func, redop, half), \
+  NCCL_FUNC4(func, redop, float), \
+  NCCL_FUNC4(func, redop, double)
+#define NCCL_FUNCS3B(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t)

 // Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum ), \
-  NCCL_FUNCS3A(coll, prod), \
-  NCCL_FUNCS3A(coll, max ), \
-  NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
+#define NCCL_FUNCS2A(func) \
+  NCCL_FUNCS3A(func, Sum ), \
+  NCCL_FUNCS3A(func, Prod), \
+  NCCL_FUNCS3A(func, Max ), \
+  NCCL_FUNCS3A(func, Min )
+#define NCCL_FUNCS2B(func) \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum)

 // Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
-  NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
-  NCCL_FUNCS2B(ncclBroadcast), \
-  NCCL_FUNCS2A(ncclReduce), \
-  NCCL_FUNCS2B(ncclAllGather), \
-  NCCL_FUNCS2A(ncclReduceScatter), \
-  NCCL_FUNCS2A(ncclAllReduce) }
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\
+  NCCL_FUNCS2B(Broadcast), \
+  NCCL_FUNCS2A(Reduce), \
+  NCCL_FUNCS2B(AllGather), \
+  NCCL_FUNCS2A(ReduceScatter), \
+  NCCL_FUNCS2A(AllReduce) }

 // Must be consistent with the ncclFuncSet enum
 __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
@@ -72,12 +72,12 @@ __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCC
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
 #if __CUDA_ARCH__
-  NCCL_COLL_NAME(ncclSendRecv, copy, i8),
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
+  NCCL_FUNCS2B(Broadcast),
+  NCCL_FUNCS2A(Reduce),
+  NCCL_FUNCS2B(AllGather),
+  NCCL_FUNCS2A(ReduceScatter),
+  NCCL_FUNCS2A(AllReduce)
 #endif
 };
 #endif
@@ -9,29 +9,9 @@
 #define OP128_H_

 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
-  v0=LOAD(ptr);
-  v1=LOAD(ptr+1);
-}
-
-inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
-  STORE(ptr, v0);
-  STORE(ptr+1, v1);
-}
-
 inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
  return (uint64_t*)shmemGenericPtr;
 }
-
-inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
-  v0=LOAD(shmemAsmPtr);
-  v1=LOAD(shmemAsmPtr+1);
-}
-
-inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
-  STORE(shmemAsmPtr, v0);
-  STORE(shmemAsmPtr+1, v1);
-}
 #else
 inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
  asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
@@ -32,87 +32,79 @@
  } \
 } while (0)

-#define barrier_by_id(id) do { \
+#define barrier_by_group() do { \
  const int w = threadIdx.x/WARP_SIZE; \
-  barrier_next[id*MAXWARPS+w] += nthreads/WARP_SIZE; \
-  __atomic_fetch_add(barriers+id, 1, __ATOMIC_SEQ_CST); \
-  while (LOAD(barriers+id) < barrier_next[id*MAXWARPS+w]) /* spin */; \
+  const int wid = threadIdx.x%WARP_SIZE; \
+  if (wid == 0) { \
+    barrier_next[w] += nthreads/WARP_SIZE; \
+    __atomic_fetch_add(barriers, 1, __ATOMIC_SEQ_CST); \
+    while (LOAD(barriers) < barrier_next[w]) /* spin */; \
+  } \
 } while (0)

+#define ROLE_SRC       0x01
+#define ROLE_DST       0x02
+#define ROLE_WAIT_RECV 0x04
+#define ROLE_WAIT_SEND 0x08
+#define ROLE_POST_SEND 0x10
+#define ROLE_POST_RECV 0x20
+
 // Implementation of primitive types
 template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
 class ncclPrimitives {
 private:
  const int tid;
-  const int nthreads;
-  const int wid;
+  int nthreads;
+  int nworkers;
  const int stepSize;
  int nrecv = 0;
  int nsend = 0;
-  struct ncclConnInfo* recvConn = NULL;
-  volatile uint64_t* recvConnHeadPtr = NULL;
-  uint64_t recvConnHead;
-  volatile uint64_t* recvConnTailPtr = NULL;
-  uint64_t recvConnTail;
-  uint64_t recvConnTailCache; // Cache last seen value
+  struct ncclConnInfo* conn = NULL;
+  volatile int* connSizesFifoPtr = NULL;
+  void** connPtrsFifoPtr = NULL;
+  volatile uint64_t* connHeadPtr = NULL;
+  volatile uint64_t* connTailPtr = NULL;
+  uint64_t connTailCache; // Cache last seen value
+  uint64_t connHeadCache; // Cache last seen value

-  struct ncclConnInfo* sendConn = NULL;
-  volatile int* sendConnFifoPtr = NULL;
-  volatile uint64_t* sendConnTailPtr = NULL;
-  uint64_t sendConnTail;
-  volatile uint64_t* sendConnHeadPtr = NULL;
-  uint64_t sendConnHead;
-  uint64_t sendConnHeadCache; // Cache last seen value
-
-  uint64_t recvStep[NRECV];
-  uint64_t sendStep[NSEND];
-#if defined(RCCL_USE_DIRECT_BUFFER)
-  const T* recvDirectBuff[NRECV];
-  T* sendDirectBuff[NSEND];
-#endif
-  const T* recvBuff[NRECV];
-  T* sendBuff[NSEND];
+  int index; // Peer index I'm responsible for
+  int peer = -1;
+  int role = 0;
+  int group;
+  uint64_t step;
+  T* direct = NULL;
+  T* buff;
  struct ncclDevComm* comm;

-  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
-  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
-  inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
-  inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
+  const T** srcs;
+  T** dsts;

  uint64_t* barriers;
  uint64_t* barrier_next;
-
+  // Don't use barrier 0 as it's used by the final sync
  inline __device__ void barrier() {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    if (wid == 0) {
-      if (NRECV < NSEND) barrier_by_id(0);
-      else barrier_by_id(1);
-    }
+    if (nthreads == WARP_SIZE) __syncwarp();
+    else barrier_by_group();
 #else
-    if (NSEND>NRECV) {
-      asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
-    } else {
-      asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
-    }
+    if (nthreads == WARP_SIZE) __syncwarp();
+    else asm volatile ("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
 #endif
  }

  inline __device__ void subBarrier() {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    __syncthreads();
+    barrier();
 #else
-    if (NSEND>NRECV) {
-      asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
-    } else {
-      asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
-    }
+    if (nworkers == nthreads) barrier();
+    else asm volatile ("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
 #endif
  }

  uint32_t spins = 0;
  uint32_t abort = 0;

-  inline __device__ int checkAbort(int i, int send) {
+  inline __device__ int checkAbort() {
    spins++;
    if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
      abort = LOAD(comm->abortFlag);
@@ -121,90 +113,54 @@ class ncclPrimitives {
    return abort;
  }

-  inline __device__ void waitSend(int nbytes) {
-    spins = 0;
-    if (sendConnHeadPtr) {
-      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
-        sendConnHeadCache = LOAD(sendConnHeadPtr);
-        if (checkAbort(wid, 1)) break;
-      }
-      if (sendConnFifoPtr) {
-        STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, nbytes);
-      }
-      sendConnHead += SLICESTEPS;
-    }
+  template <int DIRECTPTR>
+  inline __device__ T* directPtr(ssize_t directOffset) {
+    return DIRECTPTR && direct ? direct+directOffset : buff+(step%NCCL_STEPS)*stepSize;
  }

-  inline __device__ void waitRecv() {
+  template <int DST, int DIRECTSEND>
+  inline __device__ void waitSend(ssize_t directOffset, int nbytes) {
    spins = 0;
-    if (recvConnTailPtr) {
-#ifdef ENABLE_PROFILING
-      uint64_t t0 = __rtc64();
-#endif
-      while (recvConnTailCache < recvConnTail + SLICESTEPS) {
-        recvConnTailCache = LOAD(recvConnTailPtr);
-        if (checkAbort(wid, 0)) break;
-      }
-#ifdef ENABLE_PROFILING
-      __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
-#endif
-      recvConnTail += SLICESTEPS;
+    while (connHeadCache + NCCL_STEPS < step + SLICESTEPS) {
+      connHeadCache = LOAD(connHeadPtr);
+      if (checkAbort()) break;
    }
+    if (connSizesFifoPtr) {
+      STORE(connSizesFifoPtr+step%NCCL_STEPS, nbytes);
+    }
+
+    if (connPtrsFifoPtr) dsts[DST+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
+    else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
+    step += SLICESTEPS;
  }

-  inline __device__ void incRecv(int i) {
-    recvStep[i] += SLICESTEPS;
+  template <int SRC, int DIRECTRECV>
+  inline __device__ void waitRecv(ssize_t directOffset) {
+    spins = 0;
+#ifdef ENABLE_PROFILING
+    uint64_t t0 = __builtin_amdgcn_s_memrealtime();
+#endif
+    while (connTailCache < step + SLICESTEPS) {
+      connTailCache = LOAD(connTailPtr);
+      if (checkAbort()) break;
+    }
+#ifdef ENABLE_PROFILING
+    if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
+#endif
+    if (connPtrsFifoPtr) srcs[SRC+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
+    else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
+    step += SLICESTEPS;
  }
+
  inline __device__ void postRecv() {
-    if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += SLICESTEPS);
+    STORE(connHeadPtr, step += SLICESTEPS);
  }

-  inline __device__ void incSend(int i) {
-    sendStep[i] += SLICESTEPS;
-  }
  inline __device__ void postSend() {
-    if (sendConnTailPtr) {
-      if (sendConn->next_hdp_reg) STORE(sendConn->next_hdp_reg, 0x1);
-      STORE(sendConnTailPtr, sendConnTail += SLICESTEPS);
-    }
+    if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
+    STORE(connTailPtr, step += SLICESTEPS);
  }

-  template <int DIRECTRECV>
-  inline __device__ const T* directRecvPtr(int i, ssize_t directOffset) {
-#if defined(RCCL_USE_DIRECT_BUFFER)
-    return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
-#else
-    return recvPtr(i);
-#endif
-  }
-
-  template <int DIRECTSEND>
-  inline __device__ T* directSendPtr(int i, ssize_t directOffset) {
-#if defined(RCCL_USE_DIRECT_BUFFER)
-    return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
-#else
-    return sendPtr(i);
-#endif
-  }
-
-template <int DIRECTRECV>
-inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
-#if defined(RCCL_USE_DIRECT_BUFFER)
-  return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
-#else
-  return sliceInc;
-#endif
-}
-
-template <int DIRECTSEND>
-inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
-#if defined(RCCL_USE_DIRECT_BUFFER)
-  return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
-#else
-  return sliceInc;
-#endif
-}
-
  template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
  inline __device__ void
  GenericOp(const T* srcPtr, T* dstPtr, int nelem, ssize_t directOffset) {
@@ -212,148 +168,126 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
    int sliceSize = stepSize*SLICESTEPS;
    int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);

-    const T* srcs[RECV*NRECV+SRC];
-    srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
-    if (RECV) {
-      if (SRC) srcs[1] = recvPtr(0);
-      for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
-    }
-
-    T* dsts[SEND*NSEND+DST];
-    dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
-    if (SEND) {
-      if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
-      for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
-    }
-
    #pragma unroll
    for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
      int realSize = max(0, min(dataSize, nelem-offset));
 #ifdef ENABLE_PROFILING
-      uint64_t t0 = __rtc64();
+      uint64_t t0 = __builtin_amdgcn_s_memrealtime();
 #endif
-      if (SEND) waitSend(realSize*sizeof(T));
-      if (RECV) waitRecv();
-      if (realSize > 0) {
-        barrier();
+      if (tid < nworkers) {
+        if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
+        if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<SRC, DIRECTRECV>(directOffset+offset);
+        if (DST && (role & ROLE_DST)) dsts[0] = dstPtr+offset;
+        if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
+        if (realSize > 0) {
 #ifdef ENABLE_PROFILING
-        if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
+          if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
 #endif
-#if defined(RCCL_USE_DIRECT_BUFFER)
-        if (DIRECTRECV && recvDirectBuff[0]) {
-          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
-          if (SEND) {
-            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
-          }
-        } else {
-          ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+          subBarrier();
+          ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
        }
-#else
-        ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
-#endif
      }
      barrier();
-      FOR_SEND(incSend);
-      FOR_RECV(incRecv);
-      if (tid >= nthreads-WARP_SIZE) {
-        if (SEND) {
-          if (realSize > 0 && wid == 0) __threadfence_system();
-          __syncwarp();
-          postSend();
-        }
-        if (RECV) postRecv();
-      }
-      srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
-      for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
-      dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
-      for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
+      if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
+      __syncwarp();
+      if (SEND && (role & ROLE_POST_SEND)) postSend();
+      if (RECV && (role & ROLE_POST_RECV)) postRecv();
      offset += realSize;
    }
  }

-  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    recvBuff[i] = (const T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
-    recvStep[i] = LOAD(&conn->step);
-    recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
-#if defined(RCCL_USE_DIRECT_BUFFER)
-    recvDirectBuff[i] = NULL;
-    if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
-      recvDirectBuff[i] = directBuff;
-      if (tid == 0) STORE(conn->ptrExchange, directBuff);
-    }
-#endif
-    if (wid == i) recvConn = conn;
-    if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
-    nrecv++;
-  }
-
-  __device__ __forceinline__ void loadRecvSync() {
-    if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
-      recvConnTailPtr = LOAD(&recvConn->tail);
-      recvConnTailCache = LOAD(recvConnTailPtr);
-    }
-    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
-      recvConnHeadPtr = LOAD(&recvConn->head);
-      // Return credits in case we rounded up.
-      STORE(recvConnHeadPtr, recvConnHead);
+  __device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
+    if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
+      conn = &channel->devPeers[peer].recv.conn;
+      step = conn->step;
+      step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
+      if (role & ROLE_POST_RECV) {
+        connHeadPtr = conn->head;
+        // Return credits in case we rounded up.
+        STORE(connHeadPtr, step);
+      }
+      if (role & ROLE_WAIT_RECV) {
+        buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+        //if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
+        //  direct = directBuff;
+        //  *conn->ptrExchange = directBuff;
+        //}
+        connTailPtr = conn->tail;
+        connTailCache = LOAD(connTailPtr);
+        connPtrsFifoPtr = conn->ptrsFifo;
+      }
    }
  }

-  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
-    sendBuff[i] = (T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
-    sendStep[i] = LOAD(&conn->step);
-    sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
-#if defined(RCCL_USE_DIRECT_BUFFER)
-    sendDirectBuff[i] = NULL;
-    if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
-      void* volatile* ptr = LOAD(&conn->ptrExchange);
-      while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
-      barrier();
-      if (tid == 0) STORE(ptr, NULL);
-    }
-#endif
-    if (wid == i) sendConn = conn;
-    if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
-    nsend++;
-  }
-  __device__ __forceinline__ void loadSendSync() {
-    if (tid < nsend) {
-      sendConnHeadPtr = LOAD(&sendConn->head);
-      sendConnHeadCache = LOAD(sendConnHeadPtr);
-      sendConnFifoPtr = LOAD(&sendConn->fifo);
-    }
-    if (tid >= nthreads-WARP_SIZE && wid < nsend) {
-      sendConnTailPtr = LOAD(&sendConn->tail);
+  __device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
+    if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
+      conn = &channel->devPeers[peer].send.conn;
+      step = conn->step;
+      step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
+      if (role & ROLE_POST_SEND) {
+        connTailPtr = conn->tail;
+      }
+      if (role & ROLE_WAIT_SEND) {
+        buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+        //if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
+        //  void* volatile* ptr = conn->ptrExchange;
+        //  while ((direct = (T*)(*ptr)) == NULL);
+        //  *ptr = NULL;
+        //}
+        connHeadPtr = conn->head;
+        connHeadCache = LOAD(connHeadPtr);
+        connSizesFifoPtr = conn->sizesFifo;
+        connPtrsFifoPtr = conn->ptrsFifo;
+      }
    }
  }

-  __device__ __forceinline__ void saveRecvSync() {
-    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
-      STORE(&recvConn->step, recvConnHead);
-      __threadfence_system();
-    }
-  }
-
-  __device__ __forceinline__ void saveSendSync() {
-    if (tid < nsend) {
-      STORE(&sendConn->step, sendConnHead);
+  __device__ __forceinline__ void saveSync() {
+    if (role & (ROLE_POST_SEND|ROLE_POST_RECV)) {
+      conn->step = step;
      __threadfence_system();
    }
  }

 public:
  __device__ __forceinline__
-  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
-    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize) {
-    barriers = channel->barrier;
-    barrier_next = channel->barrier_next;
+  ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
+    : comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next) {
+    nthreads = nworkers;
+    // For send operations, we need an extra warp to overlap the threadfence and the copy
+    // int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
+    // nthreads += postThreads;
+
    // Make sure step is updated before we read it.
    barrier();

-    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
-    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
-    loadRecvSync();
-    loadSendSync();
+    for (int i=0; i<NRECV; i++) if (recvPeers[i] != -1) nrecv++;
+    for (int i=0; i<NSEND; i++) if (sendPeers[i] != -1) nsend++;
+
+    #define SYNC_GROUP 8
+    static_assert(NSEND < SYNC_GROUP && NRECV < SYNC_GROUP, "Not enough threads to cover all peers");
+
+    int g = tid / SYNC_GROUP;
+    int ng = nthreads / SYNC_GROUP;
+    index = tid % SYNC_GROUP;
+
+    if (g == 0) {
+      if (index < nrecv) role |= ROLE_WAIT_RECV;
+      if (index == nrecv) role |= ROLE_SRC;
+    } else if (g == 1) {
+      if (index < nsend) role |= ROLE_WAIT_SEND;
+      if (index == nsend) role |= ROLE_DST;
+    } else if (g == ng - 2) {
+      if (index < nrecv) role |= ROLE_POST_RECV;
+    } else if (g == ng - 1) {
+      if (index < nsend) role |= ROLE_POST_SEND;
+    }
+
+    if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) peer = recvPeers[index];
+    if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) peer = sendPeers[index];
+
+    loadRecvConn(channel, directBuff);
+    loadSendConn(channel);
  }

  __device__ __forceinline__ void
@@ -414,8 +348,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {

  __device__ __forceinline__ ~ncclPrimitives() {
    // Save steps for the next operation
-    saveRecvSync();
-    saveSendSync();
+    saveSync();
  }
 };

@@ -424,10 +357,10 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {

 #ifdef ENABLE_PROFILING
 #define INIT_COUNTER \
-  if (tid == 0) { t0 = __rtc64(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
+  if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }

 #define ACCUMULATE_COUNTER(prim) \
-  if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __rtc64() - t0 \
+  if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __builtin_amdgcn_s_memrealtime() - t0 \
    + ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \
    __atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
 #else
@@ -205,7 +205,7 @@ class ncclLLPrimitives {
      sendConnHeadPtr = LOAD(&sendConn->head);
      sendConnHeadCache = LOAD(sendConnHeadPtr);
      sendConnHead = LOAD(&sendConn->step);
-      sendConnFifoPtr = LOAD(&sendConn->fifo);
+      sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
    }
  }

@@ -118,9 +118,14 @@ class ncclLL128Primitives {
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
      if (u*WARP_SIZE < maxOffset) {
-        uint64_t v0, v1;
-        load128(src64Ptr+u*WARP_SIZE, v0, v1);
-        storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
+        using Vec = uint64_t __attribute__((ext_vector_type(2)));
+        Vec i2;
+        //load128(src64Ptr+u*WARP_SIZE, v0, v1);
+        asm volatile ("flat_load_dwordx4 %0, %1\n"
+          "s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(src64Ptr+u*WARP_SIZE));
+        //storeShmem128(shmemAsmPtr+u*WARP_SIZE, i2[0], i2[1]);
+        *(shmemAsmPtr+u*WARP_SIZE) = i2[0];
+        *(shmemAsmPtr+u*WARP_SIZE+1) = i2[1];
      }
    }
 #endif
@@ -135,15 +140,24 @@ class ncclLL128Primitives {

  template <int ELEMS_PER_THREAD>
  inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
-    uint64_t v[ELEMS_PER_THREAD];
+    using Velem = uint64_t __attribute__((ext_vector_type(ELEMS_PER_THREAD)));
+    Velem v;
    uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-      loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
+      v[u] = *(shmemAsmPtr+u*WARP_SIZE);
+      v[u+1] = *(shmemAsmPtr+u*WARP_SIZE+1);
+      //loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
    }
    #pragma unroll
    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-      if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+      //if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+      using Vec = uint64_t __attribute__((ext_vector_type(2)));
+      Vec i2;
+      i2[0] = v[u];
+      i2[1] = v[u+1];//
+      if (u*WARP_SIZE < maxOffset) asm volatile ("flat_store_dwordx4 %0, %1\n"
+        "s_waitcnt vmcnt(0)\n" : : "v"(dst64Ptr+u*WARP_SIZE), "v"(i2));
    }
  }

@@ -176,45 +190,52 @@ class ncclLL128Primitives {
      uint64_t flag = recvFlag(0);
      uint64_t* ptr = recvPtr(0)+ll128Offset;
      bool needReload;
-      uint64_t v0, v1;
+      using Vec = uint64_t __attribute__((ext_vector_type(2)));
+      Vec i2;
      do {
        if (wid == 0) STORE(sync, 0);
        needReload = false;
        #pragma unroll
        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-          load128(ptr+u*WARP_SIZE, v0, v1);
-          needReload |= flagThread && (v1 != flag);
+          asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
+            "s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
+          //load128(ptr+u*WARP_SIZE, v0, v1);
+          needReload |= flagThread && (i2[1] != flag);
        }
        if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
-        if (LOAD(sync) == 0) break;
-      } while (checkAbort(0, 0) == 0);
+      } while (LOAD(sync) && checkAbort(0, 0) == 0);
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-        load128(ptr+u*WARP_SIZE, v0, v1);
-        v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
-        v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
+        asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
+          "s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
+        //load128(ptr+u*WARP_SIZE, v0, v1);
+        v[u] = SRC ? MULTI<FUNC, T>()(i2[0], v[u]) : i2[0];
+        v[u+1] = SRC ? MULTI<FUNC, T>()(i2[1], v[u+1]) : i2[1];
      }

      for (int i=1; i<NRECV && i<nrecv; i++) {
        uint64_t flag = recvFlag(i);
        uint64_t* ptr = recvPtr(i)+ll128Offset;
-        uint64_t v0, v1;
+        Vec i2;
        do {
          if (wid == 0) STORE(sync, 0);
-          needReload = false;
+          needReload = 0;
          #pragma unroll
          for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-            load128(ptr+u*WARP_SIZE, v0, v1);
-            needReload |= flagThread && (v1 != flag);
+            asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
+              "s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
+            //load128(ptr+u*WARP_SIZE, v0, v1);
+            needReload |= flagThread && (i2[1] != flag);
          }
          if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
-          if (LOAD(sync) == 0) break;
-        } while (checkAbort(i, 0) == 0);
+        } while (LOAD(sync) && checkAbort(i, 0) == 0);
        #pragma unroll
        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-          load128(ptr+u*WARP_SIZE, v0, v1);
-          v[u] = MULTI<FUNC, T>()(v0, v[u]);
-          v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
+          asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
+            "s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
+          //load128(ptr+u*WARP_SIZE, v0, v1);
+          v[u] = MULTI<FUNC, T>()(i2[0], v[u]);
+          v[u+1] = MULTI<FUNC, T>()(i2[1], v[u+1]);
        }
      }
    }
@@ -223,18 +244,30 @@ class ncclLL128Primitives {
    /************************ Send **************************/
    if (SEND) {
      for (int i=1; i<NSEND && i<nsend; i++) {
-        int flag = sendFlag(i);
+        uint64_t flag = sendFlag(i);
        uint64_t* ptr = sendPtr(i)+ll128Offset;
        #pragma unroll
        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-          store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+          //store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+          using Vec = uint64_t __attribute__((ext_vector_type(2)));
+          Vec i2;
+          i2[0] = v[u];
+          i2[1] = flagThread ? flag : v[u+1];//
+          asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
+            "s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
        }
      }
-      int flag = sendFlag(0);
+      uint64_t flag = sendFlag(0);
      uint64_t* ptr = sendPtr(0)+ll128Offset;
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
-        store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+        //store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+        using Vec = uint64_t __attribute__((ext_vector_type(2)));
+        Vec i2;
+        i2[0] = v[u];
+        i2[1] = flagThread ? flag : v[u+1];//
+        asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
+          "s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
      }
    }
    /********************** End Send ************************/
@@ -279,7 +312,7 @@ class ncclLL128Primitives {
      const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
      if (SRC) {
        int done = 0;
-        if ((((uint64_t)srcPtr)&0xf) == 0) {
+        if ((((uint64_t)srcPtr)&0x3) == 0) {
          loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
          done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
        }
@@ -290,7 +323,7 @@ class ncclLL128Primitives {
      __syncwarp();
      if (DST) {
        int done = 0;
-        if ((((uint64_t)dstPtr)&0xf) == 0) {
+        if ((((uint64_t)dstPtr)&0x3) == 0) {
          storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
          done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
        }
@@ -330,10 +363,10 @@ class ncclLL128Primitives {
      sendConnHeadPtr = LOAD(&sendConn->head);
      sendConnHeadCache = LOAD(sendConnHeadPtr);
      sendConnHead = LOAD(&sendConn->step);
-      sendConnFifoPtr = LOAD(&sendConn->fifo);
+      sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
    }
    if (tid >= nthreads-WARP_SIZE && wid<nsend) {
-      if (sendConn->fifo) {
+      if (sendConn->sizesFifo) {
        sendConnTailPtr = LOAD(&sendConn->tail);
        sendConnTail = LOAD(&sendConn->step);
      }
@@ -357,12 +390,7 @@ class ncclLL128Primitives {
 public:
  __device__ __forceinline__
  ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
-    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
-    // for __any_sync
-    if (NSEND > NRECV)
-      sync = channel->sync + 2 + tid/WARP_SIZE;
-    else
-      sync = channel->sync + tid/WARP_SIZE;
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem->data+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid), sync(ncclShmem->sync+warp) {
    // Make sure step is updated before we read it.
    barrier();

@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"

-IMPL_COLL_R(ncclReduce, ncclCollReduce);
+IMPL_COLL_R(Reduce);
@@ -9,151 +9,145 @@
 #include "primitives.h"
 #include "collectives.h"

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int prevRank = ring->devUserRanks[nranks-1];
-  const int root = args->coll.root;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int prevRank = ring->devUserRanks[nranks-1];
+      const int root = args->coll.root;

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*realChunkSize;
-    int nelem = min(realChunkSize, size-offset);
-    if (prevRank == root) {
-      prims.send(thisInput+offset, nelem);
-    } else if (rank == root) {
-      prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-    } else {
-      prims.recvReduceSend(thisInput+offset, nelem);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t offset = gridOffset + bid*realChunkSize;
+        int nelem = min(realChunkSize, size-offset);
+        if (prevRank == root) {
+          prims.send(thisInput+offset, nelem);
+        } else if (rank == root) {
+          prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else {
+          prims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
    }
-  }
-}
+};

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = comm->rank;
+      const int prevRank = ring->devUserRanks[nranks-1];
+      const int root = args->coll.root;

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);

-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = comm->rank;
-  const int prevRank = ring->devUserRanks[nranks-1];
-  const int root = args->coll.root;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t offset = gridOffset + bid*chunkSize;

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (prevRank == root) {
+          LLprims.send(thisInput+offset, nelem);
+        } else if (rank == root) {
+          LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else {
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
    }
-    ssize_t offset = gridOffset + bid*chunkSize;
-
-    int nelem = min(chunkSize, size-offset);
-    if (prevRank == root) {
-      LLprims.send(thisInput+offset, nelem);
-    } else if (rank == root) {
-      LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-    } else {
-      LLprims.recvReduceSend(thisInput+offset, nelem);
-    }
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
+};

 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = comm->rank;
-  const int prevRank = ring->devUserRanks[nranks-1];
-  const int root = args->coll.root;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = comm->rank;
+      const int prevRank = ring->devUserRanks[nranks-1];
+      const int root = args->coll.root;

-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
-    ssize_t offset = gridOffset + bid*chunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+        ssize_t offset = gridOffset + bid*chunkSize;

-    int nelem = min(chunkSize, size-offset);
-    if (prevRank == root) {
-      LLprims.send(thisInput+offset, nelem);
-    } else if (rank == root) {
-      LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-    } else {
-      LLprims.recvReduceSend(thisInput+offset, nelem);
+        int nelem = min(chunkSize, size-offset);
+        if (prevRank == root) {
+          LLprims.send(thisInput+offset, nelem);
+        } else if (rank == root) {
+          LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else {
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
    }
-  }
-}
+};

-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};

-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"

-IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
+IMPL_COLL_R(ReduceScatter);
@@ -9,195 +9,189 @@
 #include "primitives.h"
 #include "collectives.h"

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*realChunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t chunkOffset = gridOffset + bid*realChunkSize;

-    /////////////// begin ReduceScatter steps ///////////////
-    ssize_t offset;
-    int nelem = min(realChunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin ReduceScatter steps ///////////////
+        ssize_t offset;
+        int nelem = min(realChunkSize, size-chunkOffset);
+        int rankDest;

-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[nranks-1];
+        offset = chunkOffset + rankDest * size;

-    prims.send(thisInput+offset, nelem);
+        prims.send(thisInput+offset, nelem);

-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+        // k-2 steps: reduce and copy to next GPU
+        for (int j=2; j<nranks; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;

-      prims.recvReduceSend(thisInput+offset, nelem);
+          prims.recvReduceSend(thisInput+offset, nelem);
+        }
+
+        // step k-1: reduce this buffer and data, which will produce the final result
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+      }
    }
+};

-    // step k-1: reduce this buffer and data, which will produce the final result
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;

-    prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
-  }
-}
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;

-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
+        /////////////// begin ReduceScatter steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;

-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[nranks-1];
+        offset = chunkOffset + rankDest * size;

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+        LLprims.send(thisInput+offset, nelem);

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
+        // k-2 steps: reduce and copy to next GPU
+        for (int j=2; j<nranks; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+
+        // step k-1: reduce this buffer and data, which will produce the final
+        // result that we store in this data
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+      }
    }
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
-    /////////////// begin ReduceScatter steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
-
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.send(thisInput+offset, nelem);
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
-
-      LLprims.recvReduceSend(thisInput+offset, nelem);
-    }
-
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
+};

 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;

-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);

-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;

-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);

-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;

-    /////////////// begin ReduceScatter steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin ReduceScatter steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;

-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[nranks-1];
+        offset = chunkOffset + rankDest * size;

-    LLprims.send(thisInput+offset, nelem);
+        LLprims.send(thisInput+offset, nelem);

-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+        // k-2 steps: reduce and copy to next GPU
+        for (int j=2; j<nranks; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;

-      LLprims.recvReduceSend(thisInput+offset, nelem);
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+
+        // step k-1: reduce this buffer and data, which will produce the final
+        // result that we store in this data
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+      }
    }
+};

-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};

-    LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+};
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,5 +8,4 @@
 #include "common.h"
 #include "collectives.h"

-IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
-IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
+IMPL_COLL_P(SendRecv);
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,74 +8,85 @@
 #include "primitives.h"
 #include "collectives.h"

-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->p2p.nThreads;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* firstArgs) {
+      struct ncclWorkElem* args = firstArgs;
+      int tid = threadIdx.x;
+      int group = 0;
+      for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
+        int nThreadsSegment = args->p2p.nThreads;
+        if (nThreadsSegment == 0) return; // Nothing else to do
+        int groupRecv = group;
+        group += 1;
+        int groupSend = group;
+        group += 1;
+        if (tid < nThreadsSegment) {
+          const int nThreads = nThreadsSegment;

-  // Compute pointers
-  const T* sendbuff = (const T*)args->sendbuff;
-  T* recvbuff = (T*)args->recvbuff;
+          // Compute pointers
+          const T* sendbuff = (const T*)args->sendbuff;
+          T* recvbuff = (T*)args->recvbuff;
+          const ssize_t sendCount = args->p2p.sendCount;
+          const ssize_t recvCount = args->p2p.recvCount;

-  if (args->p2p.delta < 0 ) return; // No-op
+          const int delta = args->p2p.delta;
+          if (delta == 0) {
+            if (tid < nThreads && sendbuff != recvbuff) {
+              // local copy : ReduceOrCopyMulti takes an int as number of elements,
+              // so we split it in blocks of 1G elements.
+              int blockSize = 1<<30;
+              for (size_t offset=0; offset<sendCount; offset += blockSize) {
+                size_t remaining = sendCount - offset;
+                if (remaining < blockSize) blockSize = remaining;
+                ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
+                sendbuff += blockSize; recvbuff += blockSize;
+              }
+            }
+          } else {
+            struct ncclDevComm* comm = args->comm;
+            struct ncclChannel* channel = comm->channels+blockIdx.x;

-  if (args->p2p.delta == 0) {
-    if (tid < nthreads && sendbuff != recvbuff) {
-      // local copy : ReduceOrCopyMulti takes an int as number of elements,
-      // so we split it in blocks of 1G elements.
-      int blockSize = 1<<30;
-      for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) {
-        size_t remaining = args->p2p.sendCount - offset;
-        if (remaining < blockSize) blockSize = remaining;
-        ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize);
-        sendbuff += blockSize; recvbuff += blockSize;
+            const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
+            const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
+
+            int nThreadsSplit = nThreads/2;
+            if ((tid < nThreadsSplit) && recvCount >= 0) {
+              int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
+              int nt = nThreadsSplit;
+              ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
+                prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
+
+              if (recvCount == 0) {
+                prims.recv(recvbuff, 0);
+              } else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
+                int realChunkSize = min(chunkSize, recvCount-offset);
+                ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
+                int nelem = min(realChunkSize, recvCount-offset);
+                prims.directRecv(recvbuff+offset, offset, nelem);
+              }
+            }
+            if ((tid >= nThreadsSplit) && sendCount >= 0) {
+              int peer = (comm->rank+delta)%comm->nRanks;
+              int nt = nThreads-nThreadsSplit;
+              ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
+                prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
+
+              if (sendCount == 0) {
+                prims.send(sendbuff, 0);
+              } else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
+                int realChunkSize = min(chunkSize, sendCount-offset);
+                ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
+                int nelem = min(realChunkSize, sendCount-offset);
+                prims.directSend(sendbuff+offset, offset, nelem);
+              }
+            }
+          }
+        }
+        tid -= nThreadsSegment;
+        if (tid < 0) return;
+        args++;
      }
    }
-    return;
-  }
-
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR;
-
-  int nthreadsSplit = nthreads/2;
-  // We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and
-  // receive threads, but then we define all peers to -1 since sender threads don't receive and
-  // receive threads don't send.
-  int peerNone[2] = {-1,-1};
-
-  if (tid < nthreadsSplit ) {
-    const ssize_t sendSize = args->p2p.sendCount;
-    if (sendSize < 0) return;
-
-    int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks;
-    ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC>
-      prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
-
-    if (sendSize == 0) {
-      prims.send(sendbuff, 0);
-    } else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) {
-      int realChunkSize = min(stepSize, sendSize-offset);
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      int nelem = min(realChunkSize, sendSize-offset);
-      prims.directSend(sendbuff+offset, offset, nelem);
-    }
-  } else {
-    const ssize_t recvSize = args->p2p.recvCount;
-    if (recvSize < 0) return;
-
-    int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks;
-    ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC>
-      prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
-
-    if (recvSize == 0) {
-      prims.recv(recvbuff, 0);
-    } else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) {
-      int realChunkSize = min(stepSize, recvSize-offset);
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      int nelem = min(realChunkSize, recvSize-offset);
-      prims.directRecv(recvbuff+offset, offset, nelem);
-    }
-  }
-}
+};
@@ -29,9 +29,10 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    return ncclSuccess;
  }
  else {
-    struct ncclInfo info = { ncclCollGather, "Gather",
-      sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
-      GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
-    return ncclEnqueueCheck(&info);
+    //struct ncclInfo info = { ncclCollGather, "Gather",
+    //  sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
+    //  GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
+    //return ncclEnqueueCheck(&info);
+    return ncclInternalError;
  }
 }
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  struct ncclInfo info = { ncclCollReduce, "Reduce",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncReduce, "Reduce",
    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
  return ncclEnqueueCheck(&info);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
-  struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
  return ncclEnqueueCheck(&info);
@@ -29,9 +29,10 @@ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
    return ncclSuccess;
  }
  else {
-    struct ncclInfo info = { ncclCollScatter, "Scatter",
-      sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
-      SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
-    return ncclEnqueueCheck(&info);
+    //struct ncclInfo info = { ncclCollScatter, "Scatter",
+    //  sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
+    //  SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
+    //return ncclEnqueueCheck(&info);
+    return ncclInternalError;
  }
 }
@@ -13,7 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream) {
-  struct ncclInfo info = { ncclCollSendRecv, "Send",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncSendRecv, "Send",
    sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
    1, 1 };
  ncclResult_t ret;
@@ -27,7 +28,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream) {
-  struct ncclInfo info = { ncclCollSendRecv, "Recv",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncSendRecv, "Recv",
    NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
    1, 1 };
  ncclResult_t ret;
@@ -128,7 +128,7 @@ void ncclDebugInit() {
 void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel == -1) ncclDebugInit();
  if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
-  if (ncclDebugLevel < level) return;
+  if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;

  // Gather the rank information. This can take > 1us so we want to make sure
  // we only do it when needed.
@@ -145,11 +145,11 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  if (level == NCCL_LOG_WARN)
    len = snprintf(buffer, sizeof(buffer),
        "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
-  else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
+  else if (level == NCCL_LOG_INFO)
    len = snprintf(buffer, sizeof(buffer),
        "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
 #ifdef ENABLE_TRACE
-  else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
+  else if (level == NCCL_LOG_TRACE) {
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
    len = snprintf(buffer, sizeof(buffer),
@@ -8,59 +8,58 @@
 #include "enqueue.h"
 #include "argcheck.h"
 #include "coll_net.h"
-#include "../graph/topo.h"

 // Only generate inline kernels for LL
-#define NCCL_FUNC5(coll, op, dtype) \
-  NCCL_KERN_NAME(coll##LL, op, dtype), \
-  NCCL_KERN_NAME(coll##LL, op, dtype), \
-  NCCL_KERN_NAME(coll##LL, op, dtype)
+#define NCCL_FUNC5(func, algo, redop, dtype) \
+  NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
+  NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
+  NCCL_KERN_NAME(func, algo, LL, redop, dtype)

-#define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_FUNC5(coll##Tree, op, dtype), \
-  NCCL_FUNC5(coll##Ring, op, dtype), \
-  NCCL_FUNC5(coll##CollNet, op, dtype)
+#define NCCL_FUNC4(func, redop, type) \
+  NCCL_FUNC5(func, TREE,    redop, type), \
+  NCCL_FUNC5(func, RING,    redop, type), \
+  NCCL_FUNC5(func, COLLNET, redop, type)

 // Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  u8), \
-  NCCL_FUNC4(coll, op, i32), \
-  NCCL_FUNC4(coll, op, u32), \
-  NCCL_FUNC4(coll, op, i64), \
-  NCCL_FUNC4(coll, op, u64), \
-  NCCL_FUNC4(coll, op, f16), \
-  NCCL_FUNC4(coll, op, f32), \
-  NCCL_FUNC4(coll, op, f64), \
-  NCCL_FUNC4(coll, op, b16)
-#define NCCL_FUNCS3B(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8)
+#define NCCL_FUNCS3A(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, uint8_t), \
+  NCCL_FUNC4(func, redop, int32_t), \
+  NCCL_FUNC4(func, redop, uint32_t), \
+  NCCL_FUNC4(func, redop, int64_t), \
+  NCCL_FUNC4(func, redop, uint64_t), \
+  NCCL_FUNC4(func, redop, half), \
+  NCCL_FUNC4(func, redop, float), \
+  NCCL_FUNC4(func, redop, double), \
+  NCCL_FUNC4(func, redop, rccl_bfloat16)
+#define NCCL_FUNCS3B(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t)

 // Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum), \
-  NCCL_FUNCS3A(coll, sum), \
-  NCCL_FUNCS3A(coll, sum), \
-  NCCL_FUNCS3A(coll, sum)
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
+#define NCCL_FUNCS2A(func) \
+  NCCL_FUNCS3A(func, Sum), \
+  NCCL_FUNCS3A(func, Sum), \
+  NCCL_FUNCS3A(func, Sum), \
+  NCCL_FUNCS3A(func, Sum)
+#define NCCL_FUNCS2B(func) \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum)

-typedef void(*ncclKern_t)(struct ncclDevComm*);
+typedef void(*ncclKern_t)(struct ncclWorkElem first);
 // Must be consistent with the ncclFuncSet enum
 static ncclKern_t const ncclKerns[1] = {
-  NCCL_KERN_NAME(ncclSendRecv, copy, i8)
+  NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
 };

 /*****************************************************************************/
@@ -70,12 +69,8 @@ static ncclKern_t const ncclKerns[1] = {
 ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
  if (cgMode & 0x01) {
    CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
-            // These flags are to reduce the latency of using this API
-#if __HIP__
-            hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
-#else
-            0));
-#endif
+      // These flags are to reduce the latency of using this API
+      hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
    return ncclSuccess;
  }
  int savedDev;
@@ -83,44 +78,62 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
  for (int i = 0; i < numDevices; i++) {
    hipLaunchParams* params = paramsList+i;
    CUDACHECK(hipSetDevice(cudaDevs[i]));
-    hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
+    hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
  }
  CUDACHECK(hipSetDevice(savedDev));
  return ncclSuccess;
 }

-ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
+static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) {
+  if (channel->workCount == NCCL_MAX_OPS) {
+    WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
+    return ncclInvalidUsage;
+  }
+  int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
+  struct ncclWork* w = channel->workFifo+opIndex;
+  struct ncclWorkElem* e = w->elems;
+  volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
+  while (LOAD(activePtr) != 0) sched_yield();
+  memset(w, 0, sizeof(struct ncclWork));
+  // Initialize with work elem if provided
+  if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
+  STORE(&e->active, 1);
+  e->index = opIndex;
+  channel->workFifoTail++;
+  channel->workCount++;
+  if (work) *work = w;
+  return ncclSuccess;
+}
+
+static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
  // Only launch blocks where we have work to do.
-  for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
-    if (comm->channels[c].collCount) params->gridDim.x = c+1;
+  for (int c=0; c<comm->p2pnChannels; c++) {
+    if (comm->channels[c].workCount) params->gridDim.x = c+1;
  }

  // Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
  for (int c=0; c<params->gridDim.x; c++) {
    struct ncclChannel* channel = comm->channels+c;
-    if (channel->collCount == 0) {
-      int opIndex = channel->collFifoTail;
-      struct ncclColl* c = channel->collectives+opIndex;
-      volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-      while (activePtr[0] != 0) sched_yield();
-
-      c->args.p2p.delta = -1; // no-op
-      c->funcIndex = FUNC_INDEX_P2P;
-      c->args.comm = comm->devComm;
-      c->active = 1;
-      opIndex = (opIndex+1)%NCCL_MAX_OPS;
-      c->nextIndex = opIndex;
-      channel->collFifoTail = opIndex;
-      channel->collCount++;
+    if (channel->workCount == 0) {
+      struct ncclWork* w;
+      NCCLCHECK(getNextOp(channel, &w, NULL));
+      struct ncclWorkElem* e = w->elems;
+      e->comm = comm->devComm;
+      e->funcIndex = FUNC_INDEX_P2P;
+      e->p2p.nThreads = 0;
    }
-    STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
+    STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
  }

  // Find the first operation, choose the kernel accordingly and pass it
  // as the first argument.
-  struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
+  struct ncclChannel* c0 = comm->channels;
+  struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
+  struct ncclWorkElem* elem = work->elems;
+  memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
+  // As we inline the first coll directly, we can free it immediately.
+  if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;

-  comm->args = comm->devComm;
  params->func = (void *)ncclKerns[0];
  return ncclSuccess;
 }
@@ -131,7 +144,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
  bool done = false;
  while (done == false) {
    if (val >= comm->intraRanks) {
-      WARN("Trying to launch too many collectives");
+      WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
      return ncclInvalidUsage;
    }
    if (val+1 == comm->intraRanks) {
@@ -151,7 +164,7 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
  int val = LOAD(ptr);
  if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
-    WARN("Trying to launch too many collectives");
+    WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
    return ncclInternalError;
  }
  return ncclSuccess;
@@ -212,7 +225,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {


  if (comm->launchMode == ncclComm::PARALLEL) {
-    hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
+    hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
  } else {
    NCCLCHECK(ncclCpuBarrierOut(comm));
  }
@@ -222,13 +235,18 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
  // launch and the ncclProxyStart call could cause a deadlock.
  // Also, starting the proxies after the CUDA launch seems to be better for
  // performance (latency).
+  uint64_t max = 0ULL;
  for (int r=0; r<params->gridDim.x; r++) {
    struct ncclChannel* channel = comm->channels+r;
-    channel->collStart = channel->collFifoTail;
-    channel->collCount = 0;
+    max = std::max(max, channel->workFifoTail);
+    channel->workCount = 0;
+  }
+  for (int r=0; r<comm->p2pnChannels; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    channel->workFifoTail = max;
  }
  params->gridDim.x = params->blockDim.x = 0;
-  comm->lastOpCount = comm->opCount;
+  comm->lastOpCount = max;
  NCCLCHECK(ncclProxyStart(comm));
  return ncclSuccess;
 }
@@ -273,10 +291,6 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
      }
    }
  }
-  if (info->coll == ncclCollAllToAll || info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv) {
-    info->algorithm = NCCL_ALGO_RING;
-    info->protocol = NCCL_PROTO_SIMPLE;
-  }
  if (info->algorithm == -1 || info->protocol == -1) {
    WARN("Error : no algorithm/protocol available");
    return ncclInternalError;
@@ -284,16 +298,12 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
  //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
  TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);

-  int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
-  if (info->comm->topo->type == RCCL_TOPO_4P2H_ROME && (info->coll == ncclCollAllToAll ||
-    info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv))
-    nc = 2;
+  int nc = (info->nChannels > 0) ? info->nChannels :
+           (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
  int nt = comm->maxThreads[info->algorithm][info->protocol];
  int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
  while (info->nBytes < nc*nt*threadThreshold) {
-    // do not reduce channels in case of alltoall
-    if (info->algorithm != NCCL_ALGO_COLLNET && info->coll != ncclCollAllToAll &&
-      info->coll != ncclCollGather && info->coll != ncclCollScatter && info->coll != ncclCollAllToAllv && nc >= 2) nc--;
+    if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // do not reduce threads count on VEGA
 #else
@@ -303,7 +313,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
  }
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
-  if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
+  if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync 
+  if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
 #endif
  info->nChannels = nc;
  info->nThreads = nt;
@@ -312,20 +323,15 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {

 static ncclResult_t getPatternInfo(struct ncclInfo* info) {
  switch (info->coll) {
-    case ncclCollBroadcast:
+    case ncclFuncBroadcast:
      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
-    case ncclCollReduce:
+    case ncclFuncReduce:
      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
-    case ncclCollReduceScatter:
-    case ncclCollAllGather:
+    case ncclFuncReduceScatter:
+    case ncclFuncAllGather:
      info->pattern = ncclPatternRing; break;
-    case ncclCollAllReduce:
+    case ncclFuncAllReduce:
      info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
-    case ncclCollGather:
-    case ncclCollScatter:
-    case ncclCollAllToAll:
-    case ncclCollAllToAllv:
-      info->pattern = ncclPatternAll; break;
    default:
      WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
      return ncclInternalError;
@@ -342,8 +348,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
    case ncclPatternPipelineTo:
    case ncclPatternCollTreeUp:
    case ncclPatternCollTreeDown:
-    case ncclPatternAll:
-      info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
+      info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
    case ncclPatternRing:
      info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
    case ncclPatternRingTwice:
@@ -355,41 +360,23 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
  return ncclSuccess;
 }

-static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
-  coll->args.sendbuff = info->sendbuff;
-  coll->args.recvbuff = info->recvbuff;
-  coll->args.comm = info->comm->devComm;
-  coll->args.opCount = info->comm->opCount;
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
+  work->comm = info->comm->devComm;

-  if (info->coll == ncclCollSendRecv) {
-    coll->args.p2p.sendCount = info->sendbytes;
-    coll->args.p2p.recvCount = info->recvbytes;
-    coll->args.p2p.delta = info->delta;
-    coll->funcIndex = FUNC_INDEX_P2P;
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
-#else
-    coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
-#endif
-    return ncclSuccess;
-  }
  // Set nstepsPerLoop and nchunksPerLoop
  NCCLCHECK(getAlgoInfo(info));
  NCCLCHECK(getPatternInfo(info));
  NCCLCHECK(getLoopInfo(info));

-  if (info->coll == ncclCollAllToAllv)  {
-    coll->args.a2av.count = info->count;
-    coll->args.a2av.nChannels = info->nChannels;
-    coll->args.a2av.nThreads = info->nThreads;
-  } else {
-    coll->args.coll.root = info->root;
-    coll->args.coll.count = info->count;
-    coll->args.coll.nChannels = info->nChannels;
-    coll->args.coll.nThreads = info->nThreads;
-  }
+  work->opCount = info->comm->opCount;
+  work->sendbuff = info->sendbuff;
+  work->recvbuff = info->recvbuff;
+  work->coll.root = info->root;
+  work->coll.count = info->count;
+  work->coll.nChannels = info->nChannels;
+  work->nThreads = info->nThreads;

-  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
+  work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);

  int stepSize   = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
  int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@@ -400,25 +387,25 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
  if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
    if (info->pattern == ncclPatternTreeUpDown) {
      // Optimize chunkSize / nSteps
-      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
-      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
-      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
    }
    // Use lastChunkSize as chunkSize
-    coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+    work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
  } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
    // Optimize chunkSize / nSteps
-    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
-    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
-    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
    // Use lastChunkSize as chunkSize
-    coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+    work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
  } else if (info->protocol == NCCL_PROTO_LL) {
    const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
    const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
-    coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
-    ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
-    coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
+    work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+    ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
+    work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
  } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
    int nNodes = info->comm->nNodes;
    float ppn = info->comm->nRanks / (float)nNodes;
@@ -426,7 +413,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
    while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
    while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
    // Use lastChunkSize as chunkSize
-    coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
+    work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
  }

  // Compute nSteps for proxies
@@ -434,20 +421,20 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
  if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
  if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
  //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
-  int nLoops;
-  if (info->pattern != ncclPatternAll)
-    nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
-  else
-    nLoops = (int)(DIVUP(info->nBytes, (((size_t)((info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1))))*info->comm->nRanks*info->nchunksPerLoop*chunkEffectiveSize));
+  int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
  proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
  proxyArgs->sliceSteps = sliceSteps;
  proxyArgs->chunkSteps = chunkSteps;
  proxyArgs->protocol = info->protocol;
-  proxyArgs->opCount = info->comm->opCount;
  proxyArgs->dtype = info->datatype;
  proxyArgs->redOp = info->op;
-  if (info->coll != ncclCollAllToAllv) TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
-      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkEffectiveSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
+  // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
+  // because some protocols need to transmit more than the total size, plus they sometimes
+  // round up
+  proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
+
+  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+      proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
      nLoops, proxyArgs->nsteps, info->comm);
  return ncclSuccess;
 }
@@ -464,32 +451,26 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
 }

 ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
-  if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
+  if (info->comm->nRanks == 1) {
    if (info->sendbuff != info->recvbuff)
      CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
    return ncclSuccess;
  }

-  struct ncclColl coll;
+  struct ncclWorkElem work;
  struct ncclProxyArgs proxyArgs;
  memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
-  NCCLCHECK(computeColl(info, &coll, &proxyArgs));
+  NCCLCHECK(computeColl(info, &work, &proxyArgs));

  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);

-  int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
+  int nChannels = work.coll.nChannels;
  int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;

  for (int bid=0; bid<nChannels*nSubChannels; bid++) {
-    int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
-      info->comm->myParams->gridDim.x % info->comm->nChannels;
+    int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
    struct ncclChannel* channel = info->comm->channels+channelId;

-    if (channel->collCount == NCCL_MAX_OPS) {
-      WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
-      return ncclInvalidUsage;
-    }
-
    // Proxy
    proxyArgs.channel = channel;
    // Adjust pattern for CollNet based on channel index
@@ -497,77 +478,143 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
      info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
    }

-    if (info->coll == ncclCollSendRecv) {
-      info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
-      NCCLCHECK(ncclProxySaveP2p(info, channel));
-    } else if (info->coll == ncclCollAllToAll || info->coll == ncclCollScatter || info->coll == ncclCollGather || info->coll == ncclCollAllToAllv)  {
-      NCCLCHECK(ncclProxySaveA2a(&proxyArgs, info));
-    } else {
-      NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
-    }
+    if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+
    info->comm->myParams->gridDim.x++;
-    int opIndex = channel->collFifoTail;
-    struct ncclColl* c = channel->collectives+opIndex;
-    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-    while (LOAD(activePtr) != 0) sched_yield();
-
-    memcpy(c, &coll, sizeof(struct ncclColl));
-    if (info->coll == ncclCollAllToAllv) {
-      c->args.a2av.extra = channel->collectivesExtra + info->comm->nRanks*4*opIndex;
-      memcpy(c->args.a2av.extra, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
-      memcpy(c->args.a2av.extra+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
-      memcpy(c->args.a2av.extra+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
-      memcpy(c->args.a2av.extra+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
-      c->args.a2av.bid = bid % coll.args.coll.nChannels;
-    } else if (info->coll != ncclCollSendRecv)
-      c->args.coll.bid = bid % coll.args.coll.nChannels;
-
-    STORE(&c->active, 1);
-    opIndex = (opIndex+1)%NCCL_MAX_OPS;
-    c->nextIndex = opIndex;
-    channel->collFifoTail = opIndex;
-    channel->collCount++;
+    work.coll.bid = bid % nChannels;
+    NCCLCHECK(getNextOp(channel, NULL, &work));
  }
  info->comm->opCount++;
  return ncclSuccess;
 }

-// Save p2p operations in comm->p2plist. Operations will be posted to channels
+#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
+#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
+
+ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
+  if (comm->asyncOpCount == 0) {
+    return ncclSuccess;
+  } else if (comm->asyncOpCount == 1) {
+    // No aggregation
+    struct ncclInfo* info = comm->asyncOps;
+    info->nChannels = 0;
+    NCCLCHECK(ncclSaveKernel(info));
+  } else {
+    // Aggregation
+    size_t channelSize = NCCL_AGG_CHANNEL_SIZE * comm->nRanks;  // scale channel size based on nranks as latency increases
+    // Reduce the per-channel size if we cannot fully utilize the channels
+    while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
+    for (int c = 0; c < comm->asyncOpCount; c++) {
+      struct ncclInfo* info = comm->asyncOps+c;
+      info->nChannels = std::min((int)DIVUP(info->nBytes, channelSize), comm->nChannels); // assign number of channels
+      NCCLCHECK(ncclSaveKernel(info));
+    }
+  }
+  // Reset counters
+  comm->asyncOpCount = 0;
+  comm->asyncTotalSize = 0;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
+  ncclComm_t comm = info->comm;
+  if (comm->asyncOpCount >= NCCL_MAX_OPS) {
+    WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS);
+    return ncclInvalidUsage;
+  }
+  memcpy(comm->asyncOps+comm->asyncOpCount, info, sizeof(struct ncclInfo));
+  comm->asyncOpCount++;
+  comm->asyncTotalSize += info->nBytes;
+  return ncclSuccess;
+}
+
+// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels
 // during ncclGroupEnd()
-ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
+static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
  struct ncclComm* comm = info->comm;
-  struct ncclP2Plist* p2plist = &comm->p2plist;
  int peer = info->root;
-  p2plist->count++;
  ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
-  if (info->recvbuff == NULL) {
+  if (info->opName[0] == 'S') { // Send
    if (peer != comm->rank) {
      int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
        if (comm->channels[channelId].peers[peer].send.connected == 0) {
-          p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
+          comm->connectSend[peer] |= (1<<channelId);
+          comm->connect = 1;
        }
      }
    }
-    p2plist->peerlist[info->root].sendbytes = nBytes;
-    p2plist->peerlist[info->root].sendbuff = info->sendbuff;
+    NCCLCHECK(enqueueP2pInfo(comm->p2pSends+info->root, (void*)info->sendbuff, nBytes));
+    comm->p2pSendCount++;
  } else {
    if (peer != comm->rank) {
      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
        if (comm->channels[channelId].peers[peer].recv.connected == 0) {
-          p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
+          comm->connectRecv[peer] |= (1<<channelId);
+          comm->connect = 1;
        }
      }
    }
-    p2plist->peerlist[info->root].recvbytes = nBytes;
-    p2plist->peerlist[info->root].recvbuff = info->recvbuff;
+    NCCLCHECK(enqueueP2pInfo(comm->p2pRecvs+info->root, info->recvbuff, nBytes));
+    comm->p2pRecvCount++;
  }
  return ncclSuccess;
 }

+static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
+  for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != info->delta; s++) {
+    if (work->elems[s].p2p.nThreads == 0) return s;
+  }
+  return -1;
+}
+
+static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
+  struct ncclWorkElem* elem = work->elems+s;
+  elem->comm = info->comm->devComm;
+  elem->funcIndex = FUNC_INDEX_P2P;
+  elem->nThreads = info->nThreads = NCCL_MAX_NTHREADS;
+  elem->sendbuff = info->sendbuff;
+  elem->recvbuff = info->recvbuff;
+  elem->opCount = info->comm->lastOpCount;
+  elem->p2p.sendCount = info->sendbytes;
+  elem->p2p.recvCount = info->recvbytes;
+  elem->p2p.delta = info->delta;
+  const int nsegments = s+1;
+  int nThreads = 512;
+  while (nsegments*nThreads > 256) nThreads /= 2;
+  //if (nThreads >= 128) nThreads += WARP_SIZE;
+  for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
+  int channelId = info->channelId;
+  struct ncclChannel* channel = info->comm->channels+channelId;
+
+  // Try to reuse last p2p operation if not full yet
+  int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
+  struct ncclWork* w = channel->workFifo+opIndex;
+  int segment = -1;
+  if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0) {
+    // Try to pack more segments into a single operation
+    segment = getSegment(info, w);
+  }
+  if (segment == -1) {
+    NCCLCHECK(getNextOp(channel, &w, NULL));
+    segment = 0;
+  }
+
+  NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
+  NCCLCHECK(saveP2pOp(info, w, segment));
+  info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
+  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
+
+  return ncclSuccess;
+}
+
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
  // Launch asynchronously if needed
  if (ncclAsyncMode()) {
@@ -585,19 +632,17 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
    NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
    NCCLCHECKGOTO(checkSetStream(info), ret, end);

-    if (info->coll == ncclCollAllToAllv)
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
-        info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
-        info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
-    else
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+    INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
        info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
        info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);

-    if (info->coll == ncclCollSendRecv) { //p2p stored separately
+    if (info->coll == ncclFuncSendRecv) { //p2p stored separately
+      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+        info->opName, info->comm->lastOpCount, info->sendbuff, info->recvbuff, info->count,
+        info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
      NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
    } else {
-      NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
+      NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
    }
 end:
    if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
@@ -608,12 +653,7 @@ end:
    NCCLCHECK(ArgsCheck(info));
    NCCLCHECK(checkSetStream(info));

-    if (info->coll == ncclCollAllToAllv)
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
-        info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
-        info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
-    else
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+    INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
        info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
        info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);

@@ -25,14 +25,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->ring.prev = channel->ring.next = -1;
-    channel->treeUp.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
-    channel->treeDn.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
-    channel->collTreeUp.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
-    channel->collTreeDn.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
+    channel->tree.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
+    channel->collTree.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;

    int* ringIntra = ringGraph->intra+c*localRanks;
    int* treeIntra = treeGraph->intra+c*localRanks;
@@ -46,33 +42,21 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
        channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
      }
      if (treeIntra[i] == rank) {
-        int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
-        int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
+        int parentIndex = 0;
+        int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
+        int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;

-        // Tree loop always flows in the same direction. Other trees are symmetric, i.e.
-        // up/down go in reverse directions
-        int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
-
-        // Down tree is common
-        topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
-        topoRanks->treeDnSend[c] = treeIntra[sendIndex];
-        channel->treeDn.up       = treeIntra[prev];
-        channel->treeDn.down[0]  = treeIntra[next];
-        // Up tree depends on the pattern
-        topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
-        topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
-        channel->treeUp.down[0]  = sym ? channel->treeDn.down[0]  : channel->treeDn.up ;
-        channel->treeUp.up       = sym ? channel->treeDn.up       : channel->treeDn.down[0];
+        topoRanks->treeToParent[c] = treeIntra[parentIndex];
+        topoRanks->treeToChild0[c] = treeIntra[child0Index];
+        topoRanks->treeToChild1[c] = treeIntra[child1Index];
+        channel->tree.up         = i == 0 ? -1 : treeIntra[i-1];
+        channel->tree.down[0]    = i == localRanks-1 ? -1 : treeIntra[i+1];
      }
      if (collNetIntra[i] == rank) {
        int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;

-        // CollTrees are always symmetric, i.e.
-        // up/down go in reverse directions
-        channel->collTreeDn.up      = collNetIntra[prev];
-        channel->collTreeDn.down[0] = collNetIntra[next];
-        channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
-        channel->collTreeUp.up      = channel->collTreeDn.up;
+        channel->collTree.up      = collNetIntra[prev];
+        channel->collTree.down[0] = collNetIntra[next];
      }
    }
    topoRanks->ringPrev[c] = channel->ring.prev;
@@ -122,72 +106,66 @@ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstR
 return ncclSuccess;
 }

-static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
-  if (u0 != -1) tree0->up = indexes[u0];
-  if (u1 != -1) tree1->up = indexes[u1];
+static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
+  if (u == -1) return ncclSuccess;
+  tree->up = indexes[u];
  return ncclSuccess;
 }

-static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
+static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
+  if (d == -1) return ncclSuccess;
  int x = 0;
-  if (down[x] >= 0) x++;
-  if (down[x] >= 0) {
-    WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
+  while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
+  if (x == NCCL_MAX_TREE_ARITY) {
+    WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
    return ncclInternalError;
  }
-  if (r0 != -1) down[x++] = indexes[r0];
-  if (r1 != -1) down[x++] = indexes[r1];
+  tree->down[x] = indexes[d];
  return ncclSuccess;
 }

-static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
-  NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1));
-  NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1));
-  return ncclSuccess;
-}
-
-static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
-  if (tree->down[0] == upRank) tree->down[0] = -1;
-  if (rank == upRank) tree->up = -1;
-  return ncclSuccess;
-}
-
-static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
+static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
  const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
-  int* indexesSend, *indexesRecv;
-  NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
-  NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
+  int* ranksToParent, *ranksToChild0, *ranksToChild1;
+  NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
+  NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
+  NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));

  // Compute tree depth. Not an exact value but a good approximation in most
  // cases
  int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);

-  int u0, d0_0, d0_1, u1, d1_0, d1_1;
-  NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
+  int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
+  NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
  for (int c=0; c<nChannels; c++) {
     struct ncclChannel* channel0 = comm->channels+c;
     struct ncclChannel* channel1 = channel0+nChannels;
-     NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
-     NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
-     NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
-     NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
-     int root = indexesSend[node];
-     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
-     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
-     NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
-     NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
-     NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
-     NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
-     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
-     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
-     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c,           channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
-     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
-     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c,           channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
-     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
-     channel0->treeUp.depth = channel1->treeUp.depth = depth;
+     NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
+     NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
+     NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
+     if (comm->rank == ranksToParent[node]) {
+       NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
+       NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
+     }
+     if (comm->rank == ranksToChild0[node]) {
+       NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
+       NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
+     }
+     if (comm->rank == ranksToChild1[node]) {
+       NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
+       NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
+     }
+     if (comm->rank == ranksToParent[node] ||
+         comm->rank == ranksToChild0[node] ||
+         comm->rank == ranksToChild1[node]) {
+       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c,           channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
+       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
+     }
+     channel0->tree.depth = channel1->tree.depth = depth;
  }
-  free(indexesSend);
-  free(indexesRecv);
+  free(ranksToParent);
+  free(ranksToChild0);
+  free(ranksToChild1);
  return ncclSuccess;
 }

@@ -200,13 +178,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
    struct ncclChannel* channel = comm->channels+c;
    // Set root of collTree to id nranks
    if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
-      channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+      channel->collTree.up = nranks;
    }
    if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
-      channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+      channel->collTree.down[0] = -1;
    }
-    channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
-    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
+    channel->collTree.depth = depth;
+    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
  }
  int recvIndex = 0;  // recv GPU index is always 0
  int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
@@ -214,13 +192,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
    struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
    // Set root of collTree to id nranks
    if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
-      channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+      channel->collTree.up = nranks;
    }
    if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
-      channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+      channel->collTree.down[0] = -1;
    }
-    channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
-    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
+    channel->collTree.depth = depth;
+    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
  }
  return ncclSuccess;
 }
@@ -255,35 +233,33 @@ int ncclMaxNchannels() {
  return maxNchannels;
 }

-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
  // Gather data from all ranks
-  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
+  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
  int nranks = comm->nRanks;
  int nChannels = comm->nChannels;
  NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
  for (int i=0; i<nranks; i++) {
    for (int c=0; c<nChannels;c++) {
      ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
      ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
      ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
      ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
-      treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
-      treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
-      treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
-      treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
+      treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
+      treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
+      treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
    }
  }

  // Connect rings and trees. This should also duplicate the channels.
  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
-  NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
+  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));

  // Duplicate ringPrev/ringNext for ncclBuildRing
  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -317,10 +293,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
  free(ringSend);
  free(ringPrev);
  free(ringNext);
-  free(treeUpRecv);
-  free(treeUpSend);
-  free(treeDnRecv);
-  free(treeDnSend);
+  free(treeToParent);
+  free(treeToChild0);
+  free(treeToChild1);

  return ncclSuccess;
 }
@@ -166,24 +166,20 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT

          // Start with path type = link type. PATH and LINK types are supposed to match.
          // Don't consider LINK_NET as we only care about the NIC->GPU path.
-          int type = link->type == LINK_NET ? 0 : link->type;
+          int type = link->type == LINK_NET ? LINK_LOC : link->type;
          // Differentiate between one and multiple PCI switches
-          if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
+          if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
          // Consider a path going through the CPU as PATH_PHB
          if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
-          // Ignore Power CPU in an NVLink path
-          if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
-              link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
+          // Set 1 hop NVLink as NVB
+          if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;

          remPath->type = std::max(path->type, type);

          // Add to the list for the next iteration if not already in the list
-          // Disallow GPUs as intermediate steps for now
-          if (remNode->type != GPU) {
-            int i;
-            for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
-            if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
-          }
+          int i;
+          for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
+          if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
        }
      }
    }
@@ -303,7 +299,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
    if (l == -1) {
      char* str = getenv(levelEnv);
      if (str) {
-        for (int i=0; i<PATH_NET; i++) {
+        for (int i=0; i<=PATH_SYS; i++) {
          if (strcmp(str, topoPathTypeStr[i]) == 0) {
            l = i;
            break;
@@ -325,9 +321,10 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
 }

 int ncclTopoUserP2pLevel = -1;
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
  *p2p = 0;
-  *read = 0;
+  if (read) *read = 0;
+  if (intermediateRank) *intermediateRank = -1;

  // Get GPUs from topology
  int g1, g2;
@@ -337,7 +334,16 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
    // GPU not found, we can't use p2p.
    return ncclSuccess;
  }
+
+
+  // Set intermediate GPU rank, if routing through an intermediate GPU.
  struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
+  if (path->count == 2) {
+    struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
+    if (intermediateNode->type == GPU && intermediateRank) {
+      *intermediateRank = intermediateNode->gpu.rank;
+    }
+  }

  // In general, use P2P whenever we can.
  int p2pLevel = PATH_SYS;
@@ -366,7 +372,7 @@ compare:
  if (path->type == PATH_NVL) {
    struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
    // Enable P2P Read for Ampere/NVLink only
-    if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
+    if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
  }

  return ncclSuccess;
@@ -456,8 +462,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer

    // Update path when we don't want to / can't use GPU Direct P2P
    for (int p=0; p<system->nodes[GPU].count; p++) {
-      int p2p, read;
-      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
+      int p2p;
+      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
      if (p2p == 0) {
        // Divert all traffic through the CPU
        int cpu;
@@ -565,8 +571,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
    // Local rank
    path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
    if (path->type == PATH_NVL) {
-      int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
-      double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+      float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
      *nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
    } else {
      *nChannels = 2;
@@ -600,16 +605,9 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
    }
  }

-  if (comm->topo->type == RCCL_TOPO_4P2H_ROME) {
-    // Adjust P2P channels on Rome
-    comm->p2pnChannelsPerPeer = 2;
-    comm->p2pnChannels = 2;
-  }
-  else {
-    // Round to next pow2 nChannelsPerPeer and nChannels
-    comm->p2pnChannelsPerPeer = nextPow2(minChannels);
-    comm->p2pnChannels = nextPow2(comm->p2pnChannels);
-  }
+  // Round to next pow2 nChannelsPerPeer and nChannels
+  comm->p2pnChannelsPerPeer = nextPow2(minChannels);
+  comm->p2pnChannels = nextPow2(comm->p2pnChannels);

  // Init channels that weren't used so far
  for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
@@ -21,7 +21,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {

 ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
  for (int r=0; r<nrings; r++) {
-    char prefix[30];
+    char prefix[40];
    /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
    dumpLine(prev+r*nranks, nranks, prefix);
    sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
@@ -25,9 +25,18 @@ static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu
  }
  return maxWidth;
 }
+static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
+  float nvlinkWidth = 0.0, pciWidth = 0.0;
+  for (int l=0; l<gpu->nlinks; l++) {
+    struct ncclTopoLink* link = gpu->links+l;
+    if (link->type == LINK_NVL) nvlinkWidth += link->width;
+    if (link->type == LINK_PCI) pciWidth = link->width;
+  }
+  return std::max(pciWidth, nvlinkWidth);
+}
 ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
  system->maxWidth = 0.0;
-  system->type = 0;
+  system->totalWidth = 0.0;
  int inter = system->nodes[NET].count;
  if (inter == 0 && system->nodes[GPU].count == 1) {
    system->maxWidth = LOC_WIDTH;
@@ -36,6 +45,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
+    system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu));
  }
  return ncclSuccess;
 }
@@ -293,7 +303,6 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
 ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
  const uint64_t flag = 1ULL<<(graph->nChannels);
  struct ncclTopoNode* gpu;
-
  NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
  if (gpu) {
    gpu->used ^= flag;
@@ -352,11 +361,26 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
        struct ncclTopoNode* net = system->nodes[NET].nodes+n;
        if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
        if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
+
+        // Balanced Tree : count half of the bandwidth on first two GPUs
+        int nextBackToNet = -1;
+        float speedInterSave = graph->speedInter;
+        if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
+          // Count half of the bandwidth on each of the first two GPUs
+          if (step == 0) nextBackToNet = 1;
+          else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
+          graph->speedInter /= 2;
+        }
+
        NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
+        graph->speedInter = speedInterSave;
        if (net) {
          graph->inter[graph->nChannels*2+1] = net->id;
-          NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
+          NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
+
+          if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2;
          NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
+          graph->speedInter = speedInterSave;
        }
      }
    }
@@ -493,13 +517,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
 ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
  if (system->nodes[NET].count) {
    if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
-    else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
-    else *backToNet = 1;
-    if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
-    else *backToFirstRank = -1;
+    else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1;
+    else *backToNet = 0;
+    *backToFirstRank = -1;
  } else {
    *backToNet = -1;
-    if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
+    if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1;
    else *backToFirstRank = -1;
  }
  return ncclSuccess;
@@ -544,7 +567,7 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
 /* User defined graph from XML file */
 /************************************/

-struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
+struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
 ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
  int* inter = graph->inter+2*c;
@@ -1062,7 +1085,7 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #else
-float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArray[] = { 42.0, 30.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #endif
 #define NSPEEDS (sizeof(speedArray)/sizeof(float))

@@ -1109,11 +1132,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph

  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  // TODO: benchmark balance tree vs split tree
+  //if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+#else
+  // SPLIT_TREE works better on older archs.
+  int ccMin;
+  NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
+  if (ccMin < 80 && graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+#endif
+
  struct ncclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));

  // First try crossnic, then decrease speed and finally increase speedIntra.
-  tmpGraph.pattern = graph->pattern;
  int pass = 1;
  int speedIndex = 0;
  while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
@@ -1128,7 +1160,7 @@ search:

  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
 #if 0
-  printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+  printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
@@ -1138,7 +1170,8 @@ search:
  }
 #endif
  // Optimal solution, stop here
-  if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
+  if (time == -1) goto done;
+  if (graph->nChannels*graph->speedInter >= system->totalWidth) goto done;

  if (pass == 1) {
    // First pass, we don't have a solution yet ; try other options
@@ -1152,7 +1185,7 @@ search:

    if (time != -1) globalTimeout += time;
    else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
-    if (globalTimeout < 0) goto done;
+    if (globalTimeout < 0 && graph->nChannels) goto done;

    int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
    if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
@@ -1167,10 +1200,6 @@ search:
    tmpGraph.typeInter = PATH_PIX;

    // Try a simpler tree
-    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
-      tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
-      goto search;
-    }
    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
      goto search;
@@ -20,18 +20,17 @@
 #endif
 #include "xml.h"
 #include "cpuset.h"
-#include <numa.h>

 #define BUSID_SIZE (sizeof("0000:00:00.0"))
 #define BUSID_REDUCED_SIZE (sizeof("0000:00"))

 const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "",    "",    "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "XGMI", "PIX", "PXB", "PHB", "SYS", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "XGMI", "",    "PCI", "",    "",    "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PHB", "SYS" };
 #else
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "",    "",    "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "",    "PCI", "",    "",    "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
 #endif

 /******************************************************************/
@@ -226,7 +225,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
 }

 ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
-  INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
+  INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
  char line[1024];
  for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
  INFO(NCCL_GRAPH, "==========================================");
@@ -515,7 +514,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
      }
    }
    if (remote) {
-      int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+      float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
      NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
      if (remote->type != GPU) {
        NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
@@ -600,6 +599,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
      struct ncclXmlNode* node;
      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
      if (node == NULL) continue;
+      NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
    }
@@ -614,6 +614,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
      NCCLCHECK(collNetGetProperties(n, &props));
      struct ncclXmlNode* netNode;
      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+      NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
      NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -631,6 +632,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
    NCCLCHECK(ncclNetGetProperties(n, &props));
    struct ncclXmlNode* netNode;
    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -639,6 +641,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
  }

+  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
+  NCCLCHECK(ncclTopoTrimXml(xml));
+
  xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
@@ -747,3 +752,21 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
  }
  return ncclSuccess;
 }
+
+ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
+  *count = system->nodes[NET].count;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
+  if (system->nodes[GPU].count == 0) return ncclInternalError;
+  int min, max;
+  min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
+  for (int g=1; g<system->nodes[GPU].count; g++) {
+    min = std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
+    max = std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
+  }
+  if (ccMin) *ccMin = min;
+  if (ccMax) *ccMax = max;
+  return ncclSuccess;
+}
@@ -13,8 +13,10 @@
 #include <sched.h>

 #define LOC_WIDTH 5000.0
-#define PASCAL_NVLINK_WIDTH 18.0
-#define VOLTA_NVLINK_WIDTH 21.0
+#define SM60_NVLINK_WIDTH 18.0
+#define SM70_NVLINK_WIDTH 21.0
+#define SM80_NVLINK_WIDTH 21.0
+#define SM86_NVLINK_WIDTH 12.0
 #define PCI_WIDTH 12.0           // PCI Gen3 x16
 #define QPI_WIDTH 6.0
 #define SKL_QPI_WIDTH 9.0
@@ -40,20 +42,21 @@ extern const char* topoNodeTypeStr[];
 // We want link types and path types to match as much as possible
 #define LINK_LOC 0
 #define LINK_NVL 1
-#define LINK_PCI 2
-// Skipping 3 for PATH_PXB
-// Skipping 4 for PATH_PHB
-#define LINK_SYS 5
-#define LINK_NET 6
+// Skipping 2 for PATH_NVB
+#define LINK_PCI 3
+// Skipping 4 for PATH_PXB
+// Skipping 5 for PATH_PHB
+#define LINK_SYS 6
+#define LINK_NET 7
 extern const char* topoLinkTypeStr[];

 #define PATH_LOC 0
 #define PATH_NVL 1
-#define PATH_PIX 2
-#define PATH_PXB 3
-#define PATH_PHB 4
-#define PATH_SYS 5
-#define PATH_NET 6
+#define PATH_NVB 2
+#define PATH_PIX 3
+#define PATH_PXB 4
+#define PATH_PHB 5
+#define PATH_SYS 6
 extern const char* topoPathTypeStr[];

 struct ncclTopoNode;
@@ -125,6 +128,7 @@ struct ncclTopoNodeSet {
 struct ncclTopoSystem {
  struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
  float maxWidth;
+  float totalWidth;
  int type;
 };

@@ -141,6 +145,8 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
 ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
 ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);

+ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
+
 static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
  *index = -1;
  for (int i=0; i<system->nodes[type].count; i++) {
@@ -163,4 +169,13 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
  return ncclInternalError;
 }

+// Returns NVLink speed in GB/s
+static float ncclTopoNVLinkSpeed(int cudaCompCap) {
+  return
+    cudaCompCap == 86 ? SM86_NVLINK_WIDTH :
+    cudaCompCap >= 80 ? SM80_NVLINK_WIDTH :
+    cudaCompCap >= 70 ? SM70_NVLINK_WIDTH :
+    cudaCompCap >= 60 ? SM60_NVLINK_WIDTH :
+    SM80_NVLINK_WIDTH;
+}
 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -28,7 +28,7 @@
 *    / \     / \     /  \     \
 *   1   3   5   7   9   11    13
 */
-ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
  int up, down0, down1;
  int bit;
  for (bit=1; bit<nranks; bit<<=1) {
@@ -37,13 +37,16 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {

  if (rank == 0) {
    *u = -1;
-    *d0 = nranks > 1 ? bit >> 1 : -1;
-    *d1 = -1;
+    *d0 = -1;
+    // Child rank is > 0 so it has to be our child 1, not 0.
+    *d1 = nranks > 1 ? bit >> 1 : -1;
    return ncclSuccess;
  }

  up = (rank ^ bit) | (bit << 1);
+  // if smaller than the parent, we are his first child, otherwise we're his second
  if (up >= nranks) up = (rank ^ bit);
+  *parentChildType = (rank < up) ? 0 : 1;
  *u = up;

  int lowbit = bit >> 1;
@@ -62,42 +65,42 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
 }

 /* Build a double binary tree. Take the previous tree for the first tree.
- * For the second tree, we use a mirror tree (if nranks is odd)
+ * For the second tree, we use a mirror tree (if nranks is even)
 *
- *                 8---------0---------5
- *          ______/ \______      _____/ \______
- *         4               12   1              9
- *       /   \            /      \           /   \
- *     2       6       10          3       7      10
- *    / \     / \     /  \        / \     / \    /  \
- *   1   3   5   7   9   11      2   4   6   8  11  12
+ * 0---------------8                   3----------------11
+ *          ______/ \                 / \______
+ *         4         \               /         7
+ *       /   \        \             /        /   \
+ *     2       6       10         1        5      9
+ *    / \     / \     /  \       / \      / \    / \
+ *   1   3   5   7   9   11     0   2    4   6  8   10
 *
- * or shift it by one rank (if nranks is even)
+ * or shift it by one rank (if nranks is odd).
 *
- *                 8---------0--------------9
- *          ______/ \                ______/ \
- *         4         \              5         \
- *       /   \        \           /   \        \
- *     2       6       10       3       7       11
- *    / \     / \     /  \     / \     / \     /  \
- *   1   3   5   7   9   11   2   4   6   8   10   1
+ * 0---------------8            1---------------9
+ *          ______/ \______              ______/ \______
+ *         4               12           5                0
+ *       /   \            /           /   \            /
+ *     2       6       10           3       7       11
+ *    / \     / \     /  \         / \     / \     /  \
+ *   1   3   5   7   9   11       2   4   6   8  10   12
 */
-ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
+ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
  // First tree ... use a btree
-  ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
+  ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
  // Second tree ... mirror or shift
-  if (nranks % 2 == 0) {
+  if (nranks % 2 == 1) {
    // shift
    int shiftrank = (rank-1+nranks) % nranks;
    int u, d0, d1;
-    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
+    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
    *s1 = u == -1 ? -1 : (u+1) % nranks;
    *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
    *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
  } else {
    // mirror
    int u, d0, d1;
-    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
+    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
    *s1 = u == -1 ? -1 : nranks-1-u;
    *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
    *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
@@ -71,45 +71,66 @@ static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
 };

 // LL128 max BW (per channel) for the different collectives
-// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
-static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
+// ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce
+static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.9 };
+static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
+static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} };

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
-  int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
+  int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+#else
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
+  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
+  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
+#endif
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
    getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);

-  if (comm->nRanks <= 1) return ncclSuccess;
+  int nNodes = comm->nNodes;
+  int nRanks = comm->nRanks;
+  if (nRanks <= 1) return ncclSuccess;

  int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
-  float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
+  int cpuArch, cpuVendor, cpuModel;
+  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+  int index2 = nNodes <= 2 ? nNodes-1 : 2;
+  // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
+  int index1 = nNodes == 1 ? compCap80 : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
+  double llMaxBw = llMaxBws[index1][index2];
+  double perChMaxTreeBw = perChMaxTreeBws[compCap80][index2];
+  float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
+
  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
  int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
-  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;

  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
-    int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
-      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
-      comm->nRanks;
-    int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
-      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
-      comm->nNodes;
+    int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
+      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
+      nRanks;
+    int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
+      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
+      nNodes;

    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
+      if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
+        float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
        float busBw = graphs[a]->nChannels * speed;

        // Various model refinements
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL)    busBw *= 1.0/5.0;
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
        double maxTreeBw = comm->nNodes > 2 ?
@@ -118,21 +139,29 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
+#else
+        if (compCap80) busBw = std::min(busBw, 235.0f);
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
+        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
+#endif
        if (a == NCCL_ALGO_COLLNET) busBw *= .9;
        if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
        if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0;  // CollNet does not support LL128

        // Convert bus BW to algorithm BW
-        float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
+        float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
        comm->bandwidths[coll][a][p] = busBw * ratio;

        comm->latencies[coll][a][p] = baseLat[a][p];
        float intraLat = hwLat[intraHw[a]][a][p];
        float interLat = hwLat[NCCL_HW_NET][a][p];
-        if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
+        if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
        if (a == NCCL_ALGO_RING) {
          float lat = hwLat[hw[a]][a][p];
-          if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
+          if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
            if (ringGraph->sameChannels) {
              comm->latencies[coll][a][p] += lat;
            } else {
@@ -144,10 +173,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
          }
        } else if (a == NCCL_ALGO_TREE) {
          comm->latencies[coll][a][p] +=
-            2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+            2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
        } else {
          comm->latencies[coll][a][p] +=
-            2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
+            2 * (nRanks/nNodes-1) * intraLat + interLat;
        }
      }
    }
@@ -168,6 +197,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
    NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
  }
+  // Disable CollNet if it is not supported
+  if (comm->collNetSupport == 0) {
+    algoEnable[NCCL_ALGO_COLLNET] = 0;
+    // If user has hard set NCCL_ALGO=COLLNET, ignore it
+    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
+      algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
+      if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
+    }
+  }

  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    int pEnable = protoEnable[p];
@@ -178,7 +216,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    }
    if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
    // Only disable algo for Allreduce since others only have one
-    if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
+    if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
  }

  if (comm->rank == 0) {
@@ -214,7 +252,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
  }
-  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
+  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;

  // Override defaults with user env
  char* str = getenv("NCCL_THREAD_THRESHOLDS");
@@ -263,8 +301,16 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
    *time = -1.0; return ncclSuccess;
  }
  int logSize = log2i(info->nBytes>>6);
+
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
  else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize];
+#else
+  if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
+  if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
+  if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
+      && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
+#endif    
  *time = lat + (info->nBytes) / (1000 * bw);
  return ncclSuccess;
 }
@@ -572,7 +572,6 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
  NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
  if (index == -1) {
    if (nvmlDev == NULL) {
-      //WARN("No NVML, trying to use CUDA instead");
      const char* busId;
      NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
      if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1;
@@ -714,6 +713,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
        char* path;
        NCCLCHECK(getPciPath(busId, &path));
        NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
+        free(path);
      }
    }
  }
@@ -725,10 +725,14 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
  struct ncclXmlNode* node;
  NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
  NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-  nvmlDevice_t nvmlDev;
-  if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
+  nvmlDevice_t nvmlDev = NULL;
+  static int nvmlInit = 0;
+  if (nvmlInit == 0) {
+    nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
+  }
+  if (nvmlInit == 1) {
+    if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
+  }
  NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
  return ncclSuccess;
 }
@@ -771,12 +775,8 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
    for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
    strcpy(busId, pciSysPath+offset+1);
-    NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
-    if (parent == NULL) {
-      NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
-      NCCLCHECK(xmlSetAttr(parent, "busid", busId));
-      NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
-    }
+    NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
+    NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
  } else {
    // Virtual NIC, no PCI device, attach to first CPU
    NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
@@ -795,6 +795,28 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
  return ncclSuccess;
 }

+ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
+  const char* str;
+  NCCLCHECK(xmlGetAttr(node, "keep", &str));
+  if (str && strcmp(str, "1") == 0) {
+    NCCLCHECK(xmlUnsetAttr(node, "keep"));
+  } else {
+    // Copy nSubs and subs as they could change as we trim recursively.
+    struct ncclXmlNode* subs[MAX_SUBS];
+    int nSubs = node->nSubs;
+    memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*));
+    for (int s=0; s<nSubs; s++) {
+      NCCLCHECK(ncclTopoTrimXmlRec(subs[s]));
+    }
+    if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node));
+  }
+  return ncclSuccess;
+}
+ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
+  NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes));
+  return ncclSuccess;
+}
+
 /**************************************************/
 /* Parser rules for the user-defined graph search */
 /**************************************************/
@@ -8,7 +8,7 @@
 #define XML_H_

 // A few constraints to make the implementation easy
-#define MAX_STR_LEN 256
+#define MAX_STR_LEN 255
 #define MAX_ATTR_COUNT 16
 #define MAX_SUBS 32
 #define MAX_NODES 1024
@@ -19,10 +19,10 @@
 #define NODE_TYPE_SINGLE 3

 struct ncclXmlNode {
-  char name[MAX_STR_LEN];
+  char name[MAX_STR_LEN+1];
  struct {
-    char key[MAX_STR_LEN];
-    char value[MAX_STR_LEN];
+    char key[MAX_STR_LEN+1];
+    char value[MAX_STR_LEN+1];
  } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
  int nAttrs;
  int type;
@@ -47,6 +47,9 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
 ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
 ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);

+/* Remove unneeded parts */
+ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
+
 /**************/
 /* XML Struct */
 /* Functions  */
@@ -56,7 +59,7 @@ static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrNa
  *index = -1;
  const int nAttrs = node->nAttrs;
  for (int a=0; a<nAttrs; a++) {
-    if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
+    if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) == 0) {
      *index = a;
      return ncclSuccess;
    }
@@ -127,8 +130,10 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
  if (index == -1) {
    index = node->nAttrs++;
    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    node->attrs[index].key[MAX_STR_LEN] = '\0';
  }
  strncpy(node->attrs[index].value, value, MAX_STR_LEN);
+  node->attrs[index].value[MAX_STR_LEN] = '\0';
  return ncclSuccess;
 }

@@ -138,8 +143,10 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
  if (index == -1) {
    index = node->nAttrs++;
    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    node->attrs[index].key[MAX_STR_LEN] = '\0';
  }
  snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
+  node->attrs[index].value[MAX_STR_LEN] = '\0';
  return ncclSuccess;
 }

@@ -149,8 +156,22 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
  if (index == -1) {
    index = node->nAttrs++;
    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    node->attrs[index].key[MAX_STR_LEN] = '\0';
  }
  snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
+  node->attrs[index].value[MAX_STR_LEN] = '\0';
+  return ncclSuccess;
+}
+
+static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index == -1) return ncclSuccess;
+  for (int i=index+1; i<node->nAttrs; i++) {
+    strcpy(node->attrs[i-1].key, node->attrs[i].key);
+    strcpy(node->attrs[i-1].value, node->attrs[i].value);
+  }
+  node->nAttrs--;
  return ncclSuccess;
 }

@@ -199,6 +220,20 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
  s->parent = parent;
  if (parent) parent->subs[parent->nSubs++] = s;
  strncpy(s->name, subName, MAX_STR_LEN);
+  s->name[MAX_STR_LEN] = '\0';
+  return ncclSuccess;
+}
+
+static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
+  node->type = NODE_TYPE_NONE;
+  struct ncclXmlNode* parent = node->parent;
+  if (parent == NULL) return ncclSuccess;
+  int shift = 0;
+  for (int s=0; s<parent->nSubs; s++) {
+    if (parent->subs[s] == node) shift = 1;
+    else if (shift) parent->subs[s-1] = parent->subs[s];
+  }
+  parent->nSubs--;
  return ncclSuccess;
 }

@@ -35,7 +35,6 @@ struct ncclInitArgs {
 };
 struct ncclCollArgs {
  ncclComm_t comm;
-  int connect;
 };

 enum ncclAsyncFuncType {
@@ -110,6 +109,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {

 NCCL_API(ncclResult_t, ncclGroupStart);
 ncclResult_t ncclGroupStart() {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  if (ncclGroupMode == 0) {
    memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
  }
@@ -118,7 +118,7 @@ ncclResult_t ncclGroupStart() {
 }

 static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
-  struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
+  struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
    sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
    1, 1 };
  info.delta = delta;
@@ -126,26 +126,32 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
  info.sendbytes = sendbytes;
  info.recvbytes = recvbytes;
  if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
-  NCCLCHECK(ncclSaveKernel(&info));
+  NCCLCHECK(ncclSaveP2pKernel(&info));
  return ncclSuccess;
 }

 void* ncclAsyncThreadPreconnect(void* args_) {
  struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
-  CUDACHECKTHREAD(hipSetDevice(args->coll.comm->cudaDev));
-  for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
-    struct ncclComm* comm = args->coll.comm;
-    struct ncclChannel* channel = comm->channels+c;
-    struct ncclP2PConnect* connect = &comm->p2plist.connect;
-    NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
-    connect->nrecv[c] = 0;
-    connect->nsend[c] = 0;
-  }
+  struct ncclComm* comm = args->coll.comm;
+  CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
+  NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL));
  return args;
 }

+static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
+  size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
+  int nChannels = minChannels;
+  while (size > maxSize && nChannels <= maxChannels/2) {
+    nChannels *= 2;
+    size = DIVUP(totalSize, nChannels);
+  }
+  ALIGN_SIZE(size, minSize);
+  return size;
+}
+
 NCCL_API(ncclResult_t, ncclGroupEnd);
 ncclResult_t ncclGroupEnd() {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  if (ncclGroupMode == 0) {
    WARN("ncclGroupEnd: not in a group call.");
    return ncclInvalidUsage;
@@ -186,29 +192,21 @@ ncclResult_t ncclGroupEnd() {

  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
-      if (p2plist->count != 0) {
-        struct ncclComm* comm = args->coll.comm;
-        args->coll.connect = 0;
-        for (int c=0; c<comm->p2pnChannels; c++)
-          args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
-        if (args->coll.connect) {
-          pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
-        }
-      }
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
+      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
    }
  }

  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
      int err = pthread_join(ncclGroupThreads[i], NULL);
      if (err != 0) {
        WARN("Error waiting for pthread_join : %s\n", strerror(errno));
        return ncclSystemError;
      }
      NCCLCHECKGOTO(args->ret, ret, end);
+      args->coll.comm->connect = 0;
    }
  }

@@ -218,56 +216,98 @@ ncclResult_t ncclGroupEnd() {
      struct ncclComm* comm = args->coll.comm;
      int rank = comm->rank;
      int nRanks = comm->nRanks;
-      struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
-      if (p2plist->count) {
-        for (int delta=0; delta<nRanks; delta++) {
+      struct ncclP2Plist* p2pSends = comm->p2pSends;
+      struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
+
+      // Compute how much to split operations
+      // Natural step size matching buffer steps.
+      ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
+      // Try to use all channels
+      int nChannelsMax = comm->p2pnChannelsPerPeer;
+      int nChannelsMin = nChannelsMax;
+      // Try to use all channels, but one channel per operation.
+      while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
+      // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
+      while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
+
+      while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
+        // schedule delta 0, +1, -1, +2, -2, ...
+        // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
+        for (int d=0; d<=nRanks/4; d++) {
+          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
+          int index = 0;
+          int delta = deltas[index];
+sched_delta:
          uint32_t from = (rank+nRanks-delta)%nRanks;
          uint32_t to = (rank+delta)%nRanks;
+          struct ncclP2Pinfo* recv = p2pRecvs[from].head;
+          struct ncclP2Pinfo* send = p2pSends[to].head;
+          if (recv != NULL || send != NULL) {
+            ssize_t totRecvBytes = -1, totSendBytes = -1;
+            if (recv != NULL) totRecvBytes = recv->nbytes;
+            if (send != NULL) totSendBytes = send->nbytes;
+            ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
+            ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);

-          // Compute how much to split operations
-          // Natural step size matching buffer steps.
-          ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
-          // Split each operation on p2pnChannelsPerPeer max.
-          ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
-          ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
-          recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
-          sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
-
-          ssize_t sendOffset = 0;
-          ssize_t recvOffset = 0;
-          int remaining = 1;
-          int chunk = 0;
-          while (remaining) {
-            int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
-            remaining = 0;
-            ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
-            ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
-            if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
-            if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
-            if (sendbytes >= 0 || recvbytes >= 0) {
-              NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
-                    recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
-                    sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
+            ssize_t sendOffset = 0;
+            ssize_t recvOffset = 0;
+            int sendRemaining = 1, recvRemaining = 1;
+            int chunk = 0;
+            do {
+              int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
+              ssize_t recvbytes = totRecvBytes-recvOffset;
+              ssize_t sendbytes = totSendBytes-sendOffset;
+              if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
+              if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+              if (sendbytes >= 0 || recvbytes >= 0) {
+                NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
+                      recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
+                      sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
+              }
+              recvOffset += recvChunkSize;
+              sendOffset += sendChunkSize;
+              chunk++;
+            } while (sendRemaining || recvRemaining);
+            if (recv) {
+              NCCLCHECKGOTO(dequeueP2pInfo(p2pRecvs+from), ret, group_cleanup);
+              comm->p2pRecvCount--;
            }
-            recvOffset += recvChunkSize;
-            sendOffset += sendChunkSize;
-            chunk++;
+            if (send) {
+              NCCLCHECKGOTO(dequeueP2pInfo(p2pSends+to), ret, group_cleanup);
+              comm->p2pSendCount--;
+            }
+          }
+          index++;
+          if (index == 1 && deltas[1] == deltas[0]) index++;
+          if (index == 2 && deltas[2] == deltas[0]) index++;
+          if (index == 3 && deltas[3] == deltas[2]) index++;
+          if (index == 3 && deltas[3] == deltas[1]) index++;
+          if (index < 4) {
+            delta = deltas[index];
+            goto sched_delta;
          }
        }
-        p2plist->count = 0;
      }
    }
  }

  /* Collectives are done in three steps :
+   * 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
   * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
   * 2. Barrier Wait. No CUDA call is permitted
   * 3. Enqueue Events. CUDA event wait/enqueue.
   * This is needed because step 2 cannot call any CUDA primitive, otherwise if
-   * hipFree happens between 1 and 3, it could block that CUDA call and
+   * cudaFree happens between 1 and 3, it could block that CUDA call and
   * prevent some ranks from launching their network threads, which would
-   * prevent the NCCL call from completing, blocking the hipFree call.
+   * prevent the NCCL call from completing, blocking the cudaFree call.
   */
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_COLL) {
+      ncclComm_t comm = args->coll.comm;
+      NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup);
+    }
+  }
  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
    if (args->funcType == ASYNC_FUNC_COLL) {
@@ -304,32 +344,28 @@ group_cleanup:
        *args->init.newcomm = NULL;
      } else {
        struct ncclComm* comm = args->coll.comm;
-        for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
-          struct ncclChannel* channel = comm->channels+c;
-          for (int i=0; i<channel->collCount; i++) {
-            channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+        // Reset aggregation counters
+        comm->asyncOpCount = 0;
+        comm->asyncTotalSize = 0;
+        // Dequeue p2p lists
+        if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
+          struct ncclP2Plist* p2pSends = comm->p2pSends;
+          struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
+          for (int peer=0; peer<comm->nRanks; peer++) {
+            while (p2pSends[peer].head != NULL) dequeueP2pInfo(p2pSends+peer);
+            while (p2pRecvs[peer].head != NULL) dequeueP2pInfo(p2pRecvs+peer);
          }
-          channel->collFifoTail = channel->collStart;
-          channel->collCount = 0;
+          comm->p2pSendCount = comm->p2pRecvCount = 0;
        }
-        /* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
+        /* Free all proxy ops in state->nextOps */
        struct ncclProxyState* state = &comm->proxyState;
-        struct ncclProxyArgs *op, *start;
-        pthread_mutex_lock(&state->mutex);
-        op = start = state->ops;
-        while (op) {
-          if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
-          struct ncclProxyArgs* peerOp = op->nextPeer;
-          while (peerOp) {
-            if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
-            peerOp = peerOp->nextPeer;
-          }
-          op = op->next;
-          if (op == start) break;
+	pthread_mutex_lock(&state->poolMutex);
+	for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
+          op->next = state->pool;
+          state->pool = op;
        }
-        comm->opCount = comm->lastOpCount;
-        pthread_cond_signal(&state->cond);
-        pthread_mutex_unlock(&state->mutex);
+	pthread_mutex_unlock(&state->poolMutex);
+        state->nextOps = NULL;

        comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
        comm->userStreamSet = false;
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -16,6 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
+ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
+ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
 ncclResult_t bootstrapClose(void* commState);
 ncclResult_t bootstrapAbort(void* commState);
 #endif
@@ -24,7 +24,7 @@ static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, voi
 static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
 static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
  NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
-static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
 static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
 static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
 static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
@@ -8,63 +8,60 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_

-#define FUNC_INDEX_P2P (4+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
-#define FUNC_INDEX(coll, redop, dtype, al, pr) ((coll >= NCCL_NUM_FUNCTIONS) \
-  ? (coll-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
-  : ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
+#define FUNC_INDEX_P2P 1800
+#define FUNC_INDEX(func, redop, ncclType, al, pr) ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))

-#define NCCL_COLL_NAME(coll, op, dtype) \
-  coll##_##op##_##dtype
+#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
+  ncclFunction_##func##_##algo##_##proto##_##redop##_##type

-#define NCCL_KERN_NAME(coll, op, dtype) \
-  coll##Kernel_##op##_##dtype
+#define NCCL_KERN_NAME(func, algo, proto, redop, type) \
+  ncclKernel_##func##_##algo##_##proto##_##redop##_##type
+
+#define NCCL_IMPL_NAME(func, algo, proto) \
+  nccl##func##algo##proto

 /* Declare all collective operations */
-#define DECL_COLL5(coll, op, dtype) \
-  extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
-  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm); \
+#define DECL5(func, algo, proto, redop, type) \
+  extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args); \
+  extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first); \

-#define DECL_COLL4(coll, op, dtype) \
-  DECL_COLL5(coll, op, dtype) \
-  DECL_COLL5(coll##LL, op, dtype) \
-  DECL_COLL5(coll##LL128, op, dtype)
+#define DECL4(func, algo, redop, type) \
+  DECL5(func, algo, SIMPLE, redop, type) \
+  DECL5(func, algo, LL,     redop, type) \
+  DECL5(func, algo, LL128,  redop, type)

-#define DECL_COLL3(coll, op, dtype) \
-  DECL_COLL4(coll##Ring, op, dtype) \
-  DECL_COLL4(coll##Tree, op, dtype) \
-  DECL_COLL4(coll##CollNet, op, dtype)
+#define DECL3(func, redop, type) \
+  DECL4(func, RING,    redop, type) \
+  DECL4(func, TREE,    redop, type) \
+  DECL4(func, COLLNET, redop, type)

-#define DECL_COLL2(coll, op) \
-  DECL_COLL3(coll, op, i8) \
-  DECL_COLL3(coll, op, u8) \
-  DECL_COLL3(coll, op, i32) \
-  DECL_COLL3(coll, op, u32) \
-  DECL_COLL3(coll, op, i64) \
-  DECL_COLL3(coll, op, u64) \
-  DECL_COLL3(coll, op, f16) \
-  DECL_COLL3(coll, op, f32) \
-  DECL_COLL3(coll, op, f64) \
-  DECL_COLL3(coll, op, b16)
+#define DECL2(func, redop) \
+  DECL3(func, redop, int8_t) \
+  DECL3(func, redop, uint8_t) \
+  DECL3(func, redop, int32_t) \
+  DECL3(func, redop, uint32_t) \
+  DECL3(func, redop, int64_t) \
+  DECL3(func, redop, uint64_t) \
+  DECL3(func, redop, half) \
+  DECL3(func, redop, float) \
+  DECL3(func, redop, double) \
+  DECL3(func, redop, rccl_bfloat16)

-#define DECL_COLL(coll) \
-  DECL_COLL2(coll, sum) \
-  DECL_COLL2(coll, prod) \
-  DECL_COLL2(coll, min) \
-  DECL_COLL2(coll, max)
+#define DECL(func) \
+  DECL2(func, Sum) \
+  DECL2(func, Prod) \
+  DECL2(func, Min) \
+  DECL2(func, Max)

-#define DECL_ALL_COLLS \
-  DECL_COLL2(ncclBroadcast, copy) \
-  DECL_COLL(ncclReduce) \
-  DECL_COLL2(ncclAllGather, copy) \
-  DECL_COLL(ncclReduceScatter) \
-  DECL_COLL(ncclAllReduce) \
-  DECL_COLL5(ncclGather, copy, i8) \
-  DECL_COLL5(ncclScatter, copy, i8) \
-  DECL_COLL5(ncclAllToAll, copy, i8) \
-  DECL_COLL5(ncclAllToAllv, copy, i8) \
-  DECL_COLL5(ncclSendRecv, copy, i8) \
+#define DECL_ALL \
+  DECL2(Broadcast, Sum) \
+  DECL(Reduce) \
+  DECL2(AllGather, Sum) \
+  DECL(ReduceScatter) \
+  DECL(AllReduce) \
+  DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \

-DECL_ALL_COLLS
+DECL_ALL

 // CHUNKSIZE must be a multiple of SLICESIZE
 //#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -84,13 +81,4 @@ DECL_ALL_COLLS
 #define REDUCE_SLICESTEPS 1
 #define REDUCE_CHUNKSTEPS 1
 #define SENDRECV_SLICEFACTOR 1
-#define GATHER_SLICESTEPS 4
-#define GATHER_CHUNKSTEPS 4
-#define SCATTER_SLICESTEPS 4
-#define SCATTER_CHUNKSTEPS 4
-#define ALLTOALL_SLICESTEPS 4
-#define ALLTOALL_CHUNKSTEPS 4
-#define ALLTOALLV_SLICESTEPS 4
-#define ALLTOALLV_CHUNKSTEPS 4
-
 #endif
@@ -52,8 +52,8 @@ struct ncclRecvMem {
    struct {
      uint64_t tail;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
      int sizesFifo[NCCL_STEPS];
+      void* ptrsFifo[NCCL_STEPS];
    };
    char pad4[MEM_ALIGN];
  };
@@ -67,6 +67,10 @@ struct ncclComm {
  struct ncclTopoSystem* topo;

  void* bootstrap;
+  // Bitmasks for ncclTransportP2pSetup
+  int connect;
+  uint32_t* connectSend;
+  uint32_t* connectRecv;

  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
@@ -131,8 +135,8 @@ struct ncclComm {
  int* intraCudaDevs;
  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclDevComm* args;
-  struct ncclDevComm** argsptr;
+  struct ncclWorkElem args;
+  void* argsptr;

  // Global proxy thread
  pthread_t proxyThread;
@@ -140,8 +144,17 @@ struct ncclComm {

  // Whether this communicator uses collNet
  int collNetSupport;
+
+  // Store info of async operations
+  struct ncclInfo* asyncOps;
+  int asyncOpCount;
+  size_t asyncTotalSize;
+
  //list of async p2p operation queued in a group semantics
-  struct ncclP2Plist p2plist;
+  struct ncclP2Plist* p2pSends;
+  struct ncclP2Plist* p2pRecvs;
+  int p2pSendCount;
+  int p2pRecvCount;

  // RCCL AllToAll/Scatter/Gather API
  bool alltoallDisable;
@@ -57,5 +57,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
 #include "alloc.h"
 #include "utils.h"
 #include "param.h"
+#include "nvtx_stub.h"

 #endif // end include guard
@@ -19,7 +19,7 @@ static int hexToInt(char c) {

 #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))

-ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
+static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
  uint32_t cpumasks[CPU_SET_N_U32];
  int m = CPU_SET_N_U32-1;
  cpumasks[m] = 0;
@@ -42,7 +42,7 @@ ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
  return ncclSuccess;
 }

-ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
+static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
  int c = 0;
  uint8_t* m8 = (uint8_t*)mask;
  for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
@@ -23,8 +23,8 @@
 #endif

 #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
-extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
+extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];

 #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
 #define NCCL_ALGO_TREE 0
@@ -59,6 +59,7 @@ union ncclLLFifoLine {
 #define WARP_SIZE 64
 #define MAXCHANNELS 32
 #define NCCL_MAX_NTHREADS 256
+#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
 #define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
 #define NCCL_LL_LINES_PER_THREAD 8
 #ifdef TEST_LL_CLEANUP
@@ -72,7 +73,7 @@ union ncclLLFifoLine {
 // Make sure the clean mask will last for at least NCCL_NSTEPS
 static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");

-#define NCCL_LL128_LINESIZE 64
+#define NCCL_LL128_LINESIZE 128
 #define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
 #define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)

@@ -83,15 +84,12 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 // to 3 dests. Use 70% for reduce and 30% for bcast.
 #define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)

-#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
+#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)

 #define NCCL_DIRECT_GPU 0x01
 #define NCCL_DIRECT_NIC 0x10

-#define MAXBARRIERS 2
-#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
-
 struct ncclConnInfo {
  // Regular comm mechanism
  char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
@@ -99,9 +97,11 @@ struct ncclConnInfo {
  uint64_t *head;     // Local for send, remote for recv

  int direct;         // Direct communication
+  int shared;         // Buffers are shared
  void **ptrExchange; // Pointer exchange for direct communication

-  int *fifo;          // Size fifo for proxy
+  int *sizesFifo;     // Sizes fifo from GPU to proxy
+  void* *ptrsFifo;      // Buffer fifo from proxy to GPU

  uint64_t step;      // Keep where we are
  uint64_t llLastCleaning;
@@ -110,7 +110,6 @@ struct ncclConnInfo {
  // allows software to explicitly initiate a flush read to HDP memory. See more
  // descriptions in primitives.h.
  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
-  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 struct ncclConnector {
@@ -151,68 +150,53 @@ struct ncclDevComm;

 #pragma pack(push)  /* push current alignment to stack */
 #pragma pack(4)     /* set alignment to 4 bytes boundary */
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclDevComm* comm;
-  uint64_t opCount;
+#define NCCL_MAX_WORK_ELEMENTS 2
+#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
+
+/* ncclWork is to be a power of two, currently 8x64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclWorkElem. */
+struct ncclWorkElem {
+  // Header
+  struct ncclDevComm* comm;
+  uint16_t nThreads;
+  uint16_t funcIndex;
+  uint16_t index;
+  uint16_t active;

-  // local and remote input, output, and buffer
  const void * sendbuff;
  void * recvbuff;

-  // Op-specific fields. Make sure the common part stays the
-  // same on all structs of the union
+  uint64_t opCount;
+  // Op-specific fields.
  union {
    struct {
-      uint16_t nThreads;
-    } common;
-    struct {
-      uint16_t nThreads;
-      uint8_t bid;
-      uint8_t nChannels;
-      uint32_t root;
      size_t count;
      size_t lastChunkSize;
-    } coll;
-    struct {
-      uint16_t nThreads;
-      uint16_t unused;
-      int32_t delta;
-      size_t sendCount;
-      size_t recvCount;
-    } p2p;
-    struct {
-      uint16_t nThreads;
+      uint32_t root;
      uint8_t bid;
      uint8_t nChannels;
-      size_t count;
-      size_t* extra;
-    } a2av;
-  };
-};
-struct ncclColl {
-  union {
+    } coll;
    struct {
-      struct CollectiveArgs args;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t  active;
-    };
-    int data[0x10];
+      size_t sendCount;
+      size_t recvCount;
+      int32_t delta;
+      uint16_t nThreads;
+    } p2p;
+    uint64_t align[3];
  };
 };
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+struct ncclWork {
+  struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
+};
+static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");

 struct ncclChannel {
  union {
    struct {
      struct ncclRing ring;
-      struct ncclTree treeUp;
-      struct ncclTree treeDn;
-      struct ncclTree collTreeUp;
-      struct ncclTree collTreeDn;
+      struct ncclTree tree;
+      struct ncclTree collTree;

      int id;

@@ -221,16 +205,10 @@ struct ncclChannel {
      struct ncclPeer* devPeers;

      // Operation list for aggregation
-      struct ncclColl* collectives;
-      size_t* collectivesExtra;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
+      struct ncclWork* workFifo;
+      int workCount;
+      uint64_t workFifoTail; // Only used by CPU

-      uint32_t* sync;
-      uint64_t* barrier;
-      uint64_t* barrier_next;
 #ifdef ENABLE_PROFILING
      struct timeval tvs;
      uint64_t sizes;
@@ -288,9 +266,11 @@ struct ncclProf {

 #ifdef ENABLE_COLLTRACE
 typedef enum {
+  ncclCollTraceNotReady,
  ncclCollTraceKernelLaunchType,
  ncclCollTraceCollEndType,
-  ncclCollTraceAbortType
+  ncclCollTraceAbortType,
+  ncclCollTraceDataType
 } ncclCollTraceDataType_t;

 struct ncclCollTrace {
@@ -304,7 +284,7 @@ struct ncclCollTrace {
 };
 static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");

-#define COLLTRACE_NUM_ITEMS 1024
+#define COLLTRACE_NUM_ITEMS 8192
 #endif

 struct ncclDevComm {
@@ -19,5 +19,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
 ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
 ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
 ncclResult_t ncclSaveKernel(struct ncclInfo* info);
+ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
+ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);

 #endif // End include guard
@@ -29,7 +29,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);

 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);

 // Set CPU affinity
@@ -45,15 +45,16 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
 #define NCCL_TOPO_CPU_TYPE_ZEN 3
 #define NCCL_TOPO_CPU_TYPE_ROME 4
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
+ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);

 #define NCCL_TOPO_MAX_NODES 256

 // Init search. Needs to be done before calling ncclTopoCompute
 ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);

-#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
-#define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Split tree (send/recv from different ranks) flowing in both directions
-#define NCCL_TOPO_PATTERN_TREE 3            // Simple tree (send/recv from same rank) flowing in both directions
+#define NCCL_TOPO_PATTERN_BALANCED_TREE 1   // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
+#define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
+#define NCCL_TOPO_PATTERN_TREE 3            // All NIC traffic going to/from the same GPU
 #define NCCL_TOPO_PATTERN_RING 4            // Ring
 struct ncclTopoGraph {
  // Input / output
@@ -84,17 +85,16 @@ struct ncclTopoRanks {
  int ringSend[MAXCHANNELS];
  int ringPrev[MAXCHANNELS];
  int ringNext[MAXCHANNELS];
-  int treeUpRecv[MAXCHANNELS];
-  int treeUpSend[MAXCHANNELS];
-  int treeDnRecv[MAXCHANNELS];
-  int treeDnSend[MAXCHANNELS];
+  int treeToParent[MAXCHANNELS];
+  int treeToChild0[MAXCHANNELS];
+  int treeToChild1[MAXCHANNELS];
 };

 ncclResult_t ncclTopoPreset(struct ncclComm* comm,
    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
    struct ncclTopoRanks* topoRanks);

-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
    struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);

 ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
@@ -20,8 +20,7 @@ typedef enum {
  ncclPatternTreeDown,
  ncclPatternTreeUpDown,
  ncclPatternCollTreeUp,
-  ncclPatternCollTreeDown,
-  ncclPatternAll
+  ncclPatternCollTreeDown
 } ncclPattern_t;

 // Used to pass NCCL call information between functions
@@ -40,11 +39,6 @@ struct ncclInfo {
  // Algorithm details
  int chunkSteps;
  int sliceSteps;
-  // For alltoallv
-  const size_t *sendcounts;
-  const size_t *sdispls;
-  const size_t *recvcounts;
-  const size_t *rdispls;
  // Computed later
  int algorithm;
  int protocol;
@@ -15,6 +15,9 @@
 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2

+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 8
+
 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
 typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;

@@ -29,9 +32,9 @@ typedef struct {
  int speed;      // Port speed in Mbps.
  int port;       // Port number.
  int maxComms;   // Maximum number of comms we can create
-}ncclNetProperties_v3_t;
+}ncclNetProperties_v4_t;

-typedef ncclNetProperties_v3_t ncclNetProperties_t;
+typedef ncclNetProperties_v4_t ncclNetProperties_t;

 typedef struct {
  // Name of the network (mainly for logs)
@@ -41,7 +44,7 @@ typedef struct {
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
@@ -62,7 +65,7 @@ typedef struct {
  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
-  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* size);
@@ -70,11 +73,11 @@ typedef struct {
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v3_t;
+} ncclNet_v4_t;

-typedef ncclNet_v3_t ncclNet_t;
+typedef ncclNet_v4_t ncclNet_t;

-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4

 typedef struct {
  // Name of the collective network (mainly for logs)
@@ -85,7 +88,7 @@ typedef struct {
  // If ndev returns 0, all other functions might be set to NULL.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create connections.
@@ -105,17 +108,17 @@ typedef struct {
      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
-  ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* size);
  // Close and free collective comm objects
  ncclResult_t (*closeColl)(void* collComm);
  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v3_t;
+} ncclCollNet_v4_t;

-typedef ncclCollNet_v3_t ncclCollNet_t;
+typedef ncclCollNet_v4_t ncclCollNet_t;

-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4

 #endif // end include guard
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -25,7 +25,7 @@ static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, voi
 static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
 static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
 static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
 static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -45,14 +45,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
  NVMLCHECK(nvmlDeviceGetIndex(device, index));
  return ncclSuccess;
 }
-static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
-  NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
-  NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
-  return ncclSuccess;
-}
 static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
  NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
  return ncclSuccess;
@@ -66,10 +58,6 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
  NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
  return ncclSuccess;
 }
-static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
-  NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
-  return ncclSuccess;
-}
 static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
  NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
  return ncclSuccess;
@@ -150,12 +138,10 @@ ncclResult_t wrapNvmlShutdown(void);
 ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
 ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
 ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult);
-ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
 ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);

 #endif // NVML_DIRECT
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVTX_H_
+#define NCCL_NVTX_H_
+
+#include "nvtx3.hpp"
+
+struct nccl_domain{static constexpr char const* name{"NCCL"};};
+
+#endif
@@ -0,0 +1,141 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#include "cuda.h"
+
+#ifndef NVTOOLSEXT_CUDA_V3
+#define NVTOOLSEXT_CUDA_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for CUDA Resource Naming
+*/
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
+ *
+ * This section covers the API functions that allow to annotate CUDA resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_CUDA  4
+/** \endcond */
+
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for CUDA
+*/
+typedef enum nvtxResourceCUDAType_t
+{
+    NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
+    NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
+    NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
+    NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
+} nvtxResourceCUDAType_t;
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA device.
+ *
+ * Allows the user to associate a CUDA device with a user-provided name.
+ *
+ * \param device - The handle of the CUDA device to name.
+ * \param name   - The name of the CUDA device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA context.
+ *
+ * Allows the user to associate a CUDA context with a user-provided name.
+ *
+ * \param context - The handle of the CUDA context to name.
+ * \param name    - The name of the CUDA context.
+ *
+ * \par Example:
+ * \code
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
+ * if ( CUDA_SUCCESS != status )
+ *     goto Error;
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA stream.
+ *
+ * Allows the user to associate a CUDA stream with a user-provided name.
+ *
+ * \param stream - The handle of the CUDA stream to name.
+ * \param name   - The name of the CUDA stream.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA event.
+ *
+ * Allows the user to associate a CUDA event with a user-provided name.
+ *
+ * \param event - The handle of the CUDA event to name.
+ * \param name  - The name of the CUDA event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameCuDevice   nvtxNameCuDeviceW
+  #define nvtxNameCuContext  nvtxNameCuContextW
+  #define nvtxNameCuStream   nvtxNameCuStreamW
+  #define nvtxNameCuEvent    nvtxNameCuEventW
+#else
+  #define nvtxNameCuDevice   nvtxNameCuDeviceA
+  #define nvtxNameCuContext  nvtxNameCuContextA
+  #define nvtxNameCuStream   nvtxNameCuStreamA
+  #define nvtxNameCuEvent    nvtxNameCuEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplCuda_v3.h"
+#undef NVTX_IMPL_GUARD_CUDA
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_CUDA_V3 */
@@ -0,0 +1,117 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#include "cuda.h"
+#include "driver_types.h"
+
+#ifndef NVTOOLSEXT_CUDART_V3
+#define NVTOOLSEXT_CUDART_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for CUDA Resource Naming
+*/
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
+ *
+ * This section covers the API functions that allow to annotate CUDA resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_CUDART 5
+/** \endcond */
+
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for CUDART
+*/
+typedef enum nvtxResourceCUDARTType_t
+{
+    NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
+    NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
+    NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
+} nvtxResourceCUDARTType_t;
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA device.
+ *
+ * Allows the user to associate a CUDA device with a user-provided name.
+ *
+ * \param device - The id of the CUDA device to name.
+ * \param name   - The name of the CUDA device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA stream.
+ *
+ * Allows the user to associate a CUDA stream with a user-provided name.
+ *
+ * \param stream - The handle of the CUDA stream to name.
+ * \param name   - The name of the CUDA stream.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA event.
+ *
+ * Allows the user to associate a CUDA event with a user-provided name.
+ *
+ * \param event - The handle of the CUDA event to name.
+ * \param name  - The name of the CUDA event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameCudaDevice nvtxNameCudaDeviceW
+  #define nvtxNameCudaStream nvtxNameCudaStreamW
+  #define nvtxNameCudaEvent  nvtxNameCudaEventW
+#else
+  #define nvtxNameCudaDevice nvtxNameCudaDeviceA
+  #define nvtxNameCudaStream nvtxNameCudaStreamA
+  #define nvtxNameCudaEvent  nvtxNameCudaEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplCudaRt_v3.h"
+#undef NVTX_IMPL_GUARD_CUDART
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_CUDART_V3 */
@@ -0,0 +1,191 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#include <CL/cl.h>
+
+#ifndef NVTOOLSEXT_OPENCL_V3
+#define NVTOOLSEXT_OPENCL_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for OpenCL Resource Naming
+ */
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
+ *
+ * This section covers the API functions that allow to annotate OpenCL resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_OPENCL 6 
+/** \endcond */
+
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for OpenCL
+*/
+typedef enum nvtxResourceOpenCLType_t
+{
+    NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
+    NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
+    NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
+    NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
+    NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
+    NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
+    NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
+} nvtxResourceOpenCLType_t;
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL device.
+ *
+ * Allows to associate an OpenCL device with a user-provided name.
+ *
+ * \param device - The handle of the OpenCL device to name.
+ * \param name   - The name of the OpenCL device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL context.
+ *
+ * Allows to associate an OpenCL context with a user-provided name.
+ *
+ * \param context - The handle of the OpenCL context to name.
+ * \param name    - The name of the OpenCL context.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL command queue.
+ *
+ * Allows to associate an OpenCL command queue with a user-provided name.
+ *
+ * \param command_queue - The handle of the OpenCL command queue to name.
+ * \param name          - The name of the OpenCL command queue.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL memory object.
+ *
+ * Allows to associate an OpenCL memory object with a user-provided name.
+ *
+ * \param memobj - The handle of the OpenCL memory object to name.
+ * \param name   - The name of the OpenCL memory object.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL sampler.
+ *
+ * Allows to associate an OpenCL sampler with a user-provided name.
+ *
+ * \param sampler - The handle of the OpenCL sampler to name.
+ * \param name    - The name of the OpenCL sampler.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL program.
+ *
+ * Allows to associate an OpenCL program with a user-provided name.
+ *
+ * \param program - The handle of the OpenCL program to name.
+ * \param name    - The name of the OpenCL program.
+ *
+ * \code
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
+ *     (const char **) &cSourceCL, &program_length, &ciErrNum);
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL event.
+ *
+ * Allows to associate an OpenCL event with a user-provided name.
+ *
+ * \param evnt - The handle of the OpenCL event to name.
+ * \param name - The name of the OpenCL event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameClDevice        nvtxNameClDeviceW
+  #define nvtxNameClContext       nvtxNameClContextW
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueW
+  #define nvtxNameClMemObject     nvtxNameClMemObjectW
+  #define nvtxNameClSampler       nvtxNameClSamplerW
+  #define nvtxNameClProgram       nvtxNameClProgramW
+  #define nvtxNameClEvent         nvtxNameClEventW
+#else
+  #define nvtxNameClDevice        nvtxNameClDeviceA
+  #define nvtxNameClContext       nvtxNameClContextA
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueA
+  #define nvtxNameClMemObject     nvtxNameClMemObjectA
+  #define nvtxNameClSampler       nvtxNameClSamplerA
+  #define nvtxNameClProgram       nvtxNameClProgramA
+  #define nvtxNameClEvent         nvtxNameClEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplOpenCL_v3.h"
+#undef NVTX_IMPL_GUARD_OPENCL
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_OPENCL_V3 */
@@ -0,0 +1,382 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#ifndef NVTOOLSEXT_SYNC_V3
+#define NVTOOLSEXT_SYNC_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* \cond SHOW_HIDDEN 
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
+/** \endcond */
+
+
+/** 
+* \page PAGE_SYNCHRONIZATION Synchronization
+*
+* This section covers a subset of the API that allow users to track additional
+* synchronization details of their application.   Naming OS synchronization primitives 
+* may allow users to better understand the data collected by traced synchronization 
+* APIs.  Additionally, a user defined synchronization object can allow the users to
+* to tell the tools when the user is building their own synchronization system
+* that do not rely on the OS to provide behaviors and instead use techniques like
+* atomic operations and spinlocks.  
+*
+* See module \ref SYNCHRONIZATION for details.
+*
+* \par Example:
+* \code
+* class MyMutex
+* {
+*     volatile long bLocked;
+*     nvtxSyncUser_t hSync;
+* public:
+*     MyMutex(const char* name, nvtxDomainHandle_t d){
+*          bLocked = 0;
+*
+*          nvtxSyncUserAttributes_t attribs = { 0 };
+*          attribs.version = NVTX_VERSION;
+*          attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
+*          attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+*          attribs.message.ascii = name;
+*          hSync = nvtxDomainSyncUserCreate(d, &attribs);
+*     }
+*
+*     ~MyMutex() {
+*          nvtxDomainSyncUserDestroy(hSync);
+*     }
+*
+*     bool Lock() {
+*          nvtxDomainSyncUserAcquireStart(hSync);
+*          bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic 
+
+*          if (acquired) {
+*              nvtxDomainSyncUserAcquireSuccess(hSync);
+*          }
+*          else {
+*              nvtxDomainSyncUserAcquireFailed(hSync);
+*          }
+*          return acquired;
+*     }
+
+*     void Unlock() {
+*          nvtxDomainSyncUserReleasing(hSync);
+*          bLocked = false;
+*     }
+* };
+* \endcode
+* 
+* \version \NVTX_VERSION_2
+*/
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
+#define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
+/** \endcond */
+
+
+/*  ------------------------------------------------------------------------- */
+/** \defgroup SYNCHRONIZATION Synchronization
+* See page \ref PAGE_SYNCHRONIZATION.
+* @{
+*/
+
+/** \brief Resource type values for OSs with POSIX Thread API support
+ */
+typedef enum nvtxResourceSyncPosixThreadType_t
+{
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t  */
+} nvtxResourceSyncPosixThreadType_t;
+
+/** \brief Resource type values for Windows OSs
+*/
+typedef enum nvtxResourceSyncWindowsType_t
+{
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
+} nvtxResourceSyncWindowsType_t;
+
+/** \brief Resource type values for Linux and Linux derived OSs such as Android
+* \sa
+* ::nvtxResourceSyncPosixThreadType_t
+*/
+typedef enum nvtxResourceSyncLinuxType_t
+{
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
+} nvtxResourceSyncLinuxType_t;
+
+/** \brief Resource type values for Android come from Linux.
+* \sa
+* ::nvtxResourceSyncLinuxType_t
+* ::nvtxResourceSyncPosixThreadType_t
+*/
+typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
+
+/** \brief User Defined Synchronization Object Handle .
+* \anchor SYNCUSER_HANDLE_STRUCTURE
+*
+* This structure is opaque to the user and is used as a handle to reference
+* a user defined syncrhonization object.  The tools will return a pointer through the API for the application
+* to hold on it's behalf to reference the string in the future.
+*
+*/
+typedef struct nvtxSyncUser* nvtxSyncUser_t;
+
+/** \brief User Defined Synchronization Object Attributes Structure.
+* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
+*
+* This structure is used to describe the attributes of a user defined synchronization 
+* object.  The layout of the structure is defined by a specific version of the tools 
+* extension library and can change between different versions of the Tools Extension
+* library.
+*
+* \par Initializing the Attributes
+*
+* The caller should always perform the following three tasks when using
+* attributes:
+* <ul>
+*    <li>Zero the structure
+*    <li>Set the version field
+*    <li>Set the size field
+* </ul>
+*
+* Zeroing the structure sets all the event attributes types and values
+* to the default value.
+*
+* The version and size field are used by the Tools Extension
+* implementation to handle multiple versions of the attributes structure.
+*
+* It is recommended that the caller use one of the following to methods
+* to initialize the event attributes structure:
+*
+* \par Method 1: Initializing nvtxEventAttributes for future compatibility
+* \code
+* nvtxSyncUserAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
+* \endcode
+*
+* \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
+* \code
+* nvtxSyncUserAttributes_t attribs = {0};
+* attribs.version = 1;
+* attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
+* \endcode
+*
+* If the caller uses Method 1 it is critical that the entire binary
+* layout of the structure be configured to 0 so that all fields
+* are initialized to the default value.
+*
+* The caller should either use both NVTX_VERSION and
+* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
+* and a versioned type (Method 2).  Using a mix of the two methods
+* will likely cause either source level incompatibility or binary
+* incompatibility in the future.
+*
+* \par Settings Attribute Types and Values
+*
+*
+* \par Example:
+* \code
+* // Initialize
+* nvtxSyncUserAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
+*
+* // Configure the Attributes
+* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* attribs.message.ascii = "Example";
+* \endcode
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+*/
+typedef struct nvtxSyncUserAttributes_v0
+{
+    /**
+    * \brief Version flag of the structure.
+    *
+    * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
+    * supported in this header file. This can optionally be overridden to
+    * another version of the tools extension library.
+    */
+    uint16_t version;
+
+    /**
+    * \brief Size of the structure.
+    *
+    * Needs to be set to the size in bytes of the event attribute
+    * structure used to specify the event.
+    */
+    uint16_t size;
+
+    /** \brief Message type specified in this attribute structure.
+    *
+    * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
+    * "message" field.
+    *
+    * Default Value is NVTX_MESSAGE_UNKNOWN
+    */
+    int32_t messageType;            /* nvtxMessageType_t */
+
+    /** \brief Message assigned to this attribute structure.
+    *
+    * The text message that is attached to an event.
+    */
+    nvtxMessageValue_t message;
+
+} nvtxSyncUserAttributes_v0;
+
+typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
+
+/* ------------------------------------------------------------------------- */
+/** \brief Create a user defined synchronization object 
+* This is used to track non-OS synchronization working with spinlocks and atomics
+*
+* \param domain - Domain to own the resource
+* \param attribs - A structure to assign multiple attributes to the object.
+*
+* \return A handle that represents the newly created user defined synchronization object.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Destroy a user defined synchronization object
+* This is used to track non-OS synchronization working with spinlocks and atomics
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools that an attempt to acquire a user defined synchronization object
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools of failure in acquiring a user defined synchronization object
+* This should be called after \ref nvtxDomainSyncUserAcquireStart
+* 
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools of success in acquiring a user defined synchronization object
+* This should be called after \ref nvtxDomainSyncUserAcquireStart.
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools of releasing a reservation on user defined synchronization object
+* This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
+
+
+/** @} */ /*END defgroup*/
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplSync_v3.h"
+#undef NVTX_IMPL_GUARD_SYNC
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_SYNC_V3 */
@@ -0,0 +1,438 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+/* ---- Include required platform headers ---- */
+
+#if defined(_WIN32) 
+
+#include <Windows.h>
+
+#else
+#include <unistd.h>
+
+#if defined(__ANDROID__)
+#include <android/api-level.h> 
+#endif
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#include <sched.h>
+#endif
+
+#include <limits.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <string.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#endif
+
+/* ---- Define macros used in this file ---- */
+
+#define NVTX_INIT_STATE_FRESH 0
+#define NVTX_INIT_STATE_STARTED 1
+#define NVTX_INIT_STATE_COMPLETE 2
+
+#ifdef NVTX_DEBUG_PRINT
+#ifdef __ANDROID__
+#include <android/log.h>
+#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
+#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
+#else
+#include <stdio.h>
+#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
+#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
+#endif
+#else /* !defined(NVTX_DEBUG_PRINT) */
+#define NVTX_ERR(...)
+#define NVTX_INFO(...)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifdef __GNUC__
+#pragma GCC visibility push(hidden)
+#endif
+
+/* ---- Forward declare all functions referenced in globals ---- */
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
+    NvtxCallbackModule module,
+    NvtxFunctionTable* out_table,
+    unsigned int* out_size);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
+    uint32_t version);
+NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
+    uint32_t exportTableId);
+
+#include "nvtxInitDecls.h"
+
+/* ---- Define all globals ---- */
+
+typedef struct nvtxGlobals_t
+{
+    volatile unsigned int initState;
+    NvtxExportTableCallbacks etblCallbacks;
+    NvtxExportTableVersionInfo etblVersionInfo;
+
+    /* Implementation function pointers */
+    nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
+    nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
+    nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
+    nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
+    nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
+    nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
+    nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
+    nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
+    nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
+    nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
+    nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
+    nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
+    nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
+    nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
+    nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
+
+    nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
+    nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
+    nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
+    nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
+    nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
+    nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
+    nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
+    nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
+
+    nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
+    nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
+    nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
+    nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
+    nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
+    nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
+    nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
+    nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
+    nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
+    nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
+    nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
+    nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
+    nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
+    nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
+
+    nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
+    nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
+    nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
+    nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
+    nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
+    nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
+
+    nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
+    nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
+    nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
+    nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
+    nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
+    nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
+    nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
+    nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
+    nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
+    nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
+    nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
+    nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
+    nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
+    nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
+    nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
+
+    nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
+    nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
+    nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
+    nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
+    nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
+    nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
+
+    /* Tables of function pointers -- Extra null added to the end to ensure
+    *  a crash instead of silent corruption if a tool reads off the end. */
+    NvtxFunctionPointer* functionTable_CORE  [NVTX_CBID_CORE_SIZE   + 1];
+    NvtxFunctionPointer* functionTable_CUDA  [NVTX_CBID_CUDA_SIZE   + 1];
+    NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
+    NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
+    NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE  + 1];
+    NvtxFunctionPointer* functionTable_SYNC  [NVTX_CBID_SYNC_SIZE   + 1];
+} nvtxGlobals_t;
+
+NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
+{
+    NVTX_INIT_STATE_FRESH,
+
+    {
+        sizeof(NvtxExportTableCallbacks),
+        NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
+    },
+    {
+        sizeof(NvtxExportTableVersionInfo),
+        NVTX_VERSION,
+        0,
+        NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
+    },
+
+    /* Implementation function pointers */
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
+
+    /* Tables of function pointers */
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
+        0
+    }
+};
+
+/* ---- Define static inline implementations of core API functions ---- */
+
+#include "nvtxImplCore.h"
+
+/* ---- Define implementations of export table functions ---- */
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
+    NvtxCallbackModule module,
+    NvtxFunctionTable* out_table,
+    unsigned int* out_size)
+{
+    unsigned int bytes = 0;
+    NvtxFunctionTable table = (NvtxFunctionTable)0;
+
+    switch (module)
+    {
+    case NVTX_CB_MODULE_CORE:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
+        break;
+    case NVTX_CB_MODULE_CUDA:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
+        break;
+    case NVTX_CB_MODULE_OPENCL:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
+        break;
+    case NVTX_CB_MODULE_CUDART:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
+        break;
+    case NVTX_CB_MODULE_CORE2:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
+        break;
+    case NVTX_CB_MODULE_SYNC:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
+        break;
+    default: return 0;
+    }
+
+    if (out_size)
+        *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
+
+    if (out_table)
+        *out_table = table;
+
+    return 1;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
+{
+    switch (exportTableId)
+    {
+    case NVTX_ETID_CALLBACKS:       return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
+    case NVTX_ETID_VERSIONINFO:     return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
+    default:                        return 0;
+    }
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
+{
+    /* Reserved for custom implementations to resolve problems with tools */
+    (void)version;
+}
+
+/* ---- Define implementations of init versions of all API functions ---- */
+
+#include "nvtxInitDefs.h"
+
+/* ---- Define implementations of initialization functions ---- */
+
+#include "nvtxInit.h"
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
@@ -0,0 +1,307 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr;
+    if(local!=0)
+        (*local)(eventAttrib);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxMarkA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr;
+    if(local!=0)
+        (*local)(message);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxMarkW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr;
+    if(local!=0)
+        (*local)(message);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeStartA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeStartW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr;
+    if(local!=0)
+        (*local)(id);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePushA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePushW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePop(void)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr;
+    if(local!=0)
+        return (*local)();
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr;
+    if(local!=0)
+        (*local)(category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr;
+    if(local!=0)
+        (*local)(category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameOsThreadA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr;
+    if(local!=0)
+        (*local)(threadId, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameOsThreadW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr;
+    if(local!=0)
+        (*local)(threadId, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, eventAttrib);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, id);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainResourceCreate_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, attribs);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxResourceHandle_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainResourceDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr;
+    if(local!=0)
+        (*local)(resource);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRegisterStringA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, string);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxStringHandle_t)0;
+}
+
+NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRegisterStringW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, string);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxStringHandle_t)0;
+}
+
+NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainCreateA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxDomainHandle_t)0;
+}
+
+NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainCreateW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxDomainHandle_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr;
+    if(local!=0)
+        (*local)(domain);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved)
+{
+#ifndef NVTX_DISABLE
+    nvtxInitialize_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr;
+    if(local!=0)
+        (*local)(reserved);
+#endif /*NVTX_DISABLE*/
+}
@@ -0,0 +1,81 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_CUDART
+#error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
+typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
+typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
+typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
@@ -0,0 +1,102 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_CUDA
+#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
+typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
+typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
+typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
+typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
@@ -0,0 +1,161 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_OPENCL
+#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name);
+typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name);
+typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name);
+typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name);
+typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name);
+typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name);
+typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name);
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClDeviceA_impl_fntype local = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClDeviceW_impl_fntype local = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClContextA_impl_fntype local = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClContextW_impl_fntype local = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClCommandQueueA_impl_fntype local = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
+    if(local!=0)
+        (*local)(command_queue, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClCommandQueueW_impl_fntype local = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
+    if(local!=0)
+        (*local)(command_queue, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClMemObjectA_impl_fntype local = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
+    if(local!=0)
+        (*local)(memobj, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClMemObjectW_impl_fntype local = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
+    if(local!=0)
+        (*local)(memobj, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClSamplerA_impl_fntype local = (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
+    if(local!=0)
+        (*local)(sampler, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClSamplerW_impl_fntype local = (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
+    if(local!=0)
+        (*local)(sampler, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClProgramA_impl_fntype local = (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
+    if(local!=0)
+        (*local)(program, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClProgramW_impl_fntype local = (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
+    if(local!=0)
+        (*local)(program, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClEventA_impl_fntype local = (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
+    if(local!=0)
+        (*local)(evnt, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClEventW_impl_fntype local = (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
+    if(local!=0)
+        (*local)(evnt, name);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
@@ -0,0 +1,83 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_SYNC
+#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
+
+NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, attribs);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxSyncUser_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
@@ -0,0 +1,312 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+/* ---- Platform-independent helper definitions and functions ---- */
+
+/* Prefer macros over inline functions to reduce symbol resolution at link time */
+
+#if defined(_WIN32) 
+#define NVTX_PATHCHAR   wchar_t
+#define NVTX_STR(x)     L##x
+#define NVTX_GETENV     _wgetenv
+#define NVTX_BUFSIZE    MAX_PATH
+#define NVTX_DLLHANDLE  HMODULE
+#define NVTX_DLLOPEN(x) LoadLibraryW(x)
+#define NVTX_DLLFUNC    GetProcAddress
+#define NVTX_DLLCLOSE   FreeLibrary
+#define NVTX_YIELD()    SwitchToThread()
+#define NVTX_MEMBAR()   MemoryBarrier()
+#define NVTX_ATOMIC_WRITE_32(address, value)                        InterlockedExchange((volatile LONG*)address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
+#elif defined(__GNUC__)
+#define NVTX_PATHCHAR   char
+#define NVTX_STR(x)     x
+#define NVTX_GETENV     getenv
+#define NVTX_BUFSIZE    PATH_MAX
+#define NVTX_DLLHANDLE  void*
+#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
+#define NVTX_DLLFUNC    dlsym
+#define NVTX_DLLCLOSE   dlclose
+#define NVTX_YIELD()    sched_yield()
+#define NVTX_MEMBAR()   __sync_synchronize()
+/* Ensure full memory barrier for atomics, to match Windows functions */
+#define NVTX_ATOMIC_WRITE_32(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
+#else
+#error The library does not support your configuration!
+#endif
+
+/* Define this to 1 for platforms that where pre-injected libraries can be discovered. */
+#if defined(_WIN32)
+/* TODO */
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#else
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#endif
+
+/* Define this to 1 for platforms that support environment variables */
+/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
+/* Try:  #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
+#define NVTX_SUPPORT_ENV_VARS 1
+
+/* Define this to 1 for platforms that support dynamic/shared libraries */
+#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
+
+/* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked,
+*  and this will override any dynamic injection.  Useful for platforms where dynamic
+*  injection is not available.  Since weak symbols not explicitly marked extern are
+*  guaranteed to be initialized to zero if no definitions are found by the linker, the
+*  dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
+#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
+/* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal
+*  symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which
+*  does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic
+*  injection library. */
+__attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr;
+#else
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
+#endif
+
+/* This function tries to find or load an NVTX injection library and get the
+*  address of its InitializeInjection2 function.  If such a function pointer
+*  is found, it is called, and passed the address of this NVTX instance's
+*  nvtxGetExportTable function, so the injection can attach to this instance.
+*  If the initialization fails for any reason, any dynamic library loaded will
+*  be freed, and all NVTX implementation functions will be set to no-ops.  If
+*  initialization succeeds, NVTX functions not attached to the tool will be set
+*  to no-ops.  This is implemented as one function instead of several small
+*  functions to minimize the number of weak symbols the linker must resolve.
+*  Order of search is:
+*  - Pre-injected library exporting InitializeInjectionNvtx2
+*  - Loadable library exporting InitializeInjectionNvtx2
+*      - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
+*      - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
+*  - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
+*/
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void);
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void)
+{
+    const char* const initFuncName = "InitializeInjectionNvtx2";
+    NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0;
+    NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
+    int entryPointStatus = 0;
+
+#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
+    /* Use POSIX global symbol chain to query for init function from any module */
+    init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName);
+#endif
+
+#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
+    /* Try discovering dynamic injection library to load */
+    if (!init_fnptr)
+    {
+#if NVTX_SUPPORT_ENV_VARS
+        /* If env var NVTX_INJECTION64_PATH is set, it should contain the path
+        *  to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
+        const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
+            ? NVTX_STR("NVTX_INJECTION32_PATH")
+            : NVTX_STR("NVTX_INJECTION64_PATH");
+#endif /* NVTX_SUPPORT_ENV_VARS */
+        NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
+        const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
+
+        /* Refer to this variable explicitly in case all references to it are #if'ed out */
+        (void)injectionLibraryPathBuf;
+
+#if NVTX_SUPPORT_ENV_VARS
+        /* Disable the warning for getenv & _wgetenv -- this usage is safe because
+        *  these functions are not called again before using the returned value. */
+#if defined(_MSC_VER)
+#pragma warning( push )
+#pragma warning( disable : 4996 )
+#endif
+        injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
+#if defined(_MSC_VER)
+#pragma warning( pop )
+#endif
+#endif
+
+#if defined(__ANDROID__)
+        if (!injectionLibraryPath)
+        {
+            const char *bits = (sizeof(void*) == 4) ? "32" : "64";
+            char cmdlineBuf[32];
+            char pkgName[PATH_MAX];
+            int count;
+            int pid;
+            FILE *fp;
+            size_t bytesRead;
+            size_t pos;
+
+            pid = (int)getpid();
+            count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
+            if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
+            {
+                NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            fp = fopen(cmdlineBuf, "r");
+            if (!fp)
+            {
+                NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
+            fclose(fp);
+            if (bytesRead == 0)
+            {
+                NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            pkgName[bytesRead] = 0;
+
+            /* String can contain colon as a process separator. In this case the package name is before the colon. */
+            pos = 0;
+            while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
+            {
+                ++pos;
+            }
+            pkgName[pos] = 0;
+
+            count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
+            if (count <= 0 || count >= NVTX_BUFSIZE)
+            {
+                NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            /* On Android, verify path is accessible due to aggressive file access restrictions. */
+            /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
+            /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
+            if (injectionLibraryPathBuf[0] == '/')
+            {
+#if (__ANDROID_API__ < 21)
+                int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
+#else
+                int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
+#endif
+                if (access_err != 0)
+                {
+                    NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
+                    return NVTX_ERR_INIT_ACCESS_LIBRARY;
+                }
+            }
+            injectionLibraryPath = injectionLibraryPathBuf;
+        }
+#endif
+
+        /* At this point, injectionLibraryPath is specified if a dynamic
+        *  injection library was specified by a tool. */
+        if (injectionLibraryPath)
+        {
+            /* Load the injection library */
+            injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
+            if (!injectionLibraryHandle)
+            {
+                NVTX_ERR("Failed to load injection library\n");
+                return NVTX_ERR_INIT_LOAD_LIBRARY;
+            }
+            else
+            {
+                /* Attempt to get the injection library's entry-point */
+                init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
+                if (!init_fnptr)
+                {
+                    NVTX_DLLCLOSE(injectionLibraryHandle);
+                    NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n");
+                    return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
+                }
+            }
+        }
+    }
+#endif
+
+#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
+    if (!init_fnptr)
+    {
+        /* Check weakly-defined function pointer.  A statically-linked injection can define this as
+        *  a normal symbol and it will take precedence over a dynamic injection. */
+        if (InitializeInjectionNvtx2_fnptr)
+        {
+            init_fnptr = InitializeInjectionNvtx2_fnptr;
+        }
+    }
+#endif
+
+    /* At this point, if init_fnptr is not set, then no tool has specified
+    *  an NVTX injection library -- return non-success result so all NVTX
+    *  API functions will be set to no-ops. */
+    if (!init_fnptr)
+    {
+        return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
+    }
+
+    /* Invoke injection library's initialization function.  If it returns
+    *  0 (failure) and a dynamic injection was loaded, unload it. */
+    entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable));
+    if (entryPointStatus == 0)
+    {
+        NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
+        if (injectionLibraryHandle)
+        {
+            NVTX_DLLCLOSE(injectionLibraryHandle);
+        }
+        return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT;
+    }
+
+    return NVTX_SUCCESS;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void)
+{
+    unsigned int old;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE)
+    {
+        return;
+    }
+
+    NVTX_ATOMIC_CAS_32(
+        old,
+        &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
+        NVTX_INIT_STATE_STARTED,
+        NVTX_INIT_STATE_FRESH);
+    if (old == NVTX_INIT_STATE_FRESH)
+    {
+        int result;
+        int forceAllToNoops;
+
+        /* Load & initialize injection library -- it will assign the function pointers */
+        result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)();
+
+        /* Set all pointers not assigned by the injection to null */
+        forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */
+        NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops);
+
+        /* Signal that initialization has finished, so now the assigned function pointers will be used */
+        NVTX_ATOMIC_WRITE_32(
+            &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
+            NVTX_INIT_STATE_COMPLETE);
+    }
+    else /* Spin-wait until initialization has finished */
+    {
+        NVTX_MEMBAR();
+        while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE)
+        {
+            NVTX_YIELD();
+            NVTX_MEMBAR();
+        }
+    }
+}
@@ -0,0 +1,81 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle);
@@ -0,0 +1,573 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxMarkEx(eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxMarkA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxMarkW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangeStartEx(eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangeStartA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangeStartW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxRangeEnd(id);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePushEx(eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePushA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePushW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePop();
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameCategoryA(category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameCategoryW(category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameOsThreadA(threadId, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameOsThreadW(threadId, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainMarkEx(domain, eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRangeStartEx(domain, eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainRangeEnd(domain, id);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRangePushEx(domain, eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRangePop(domain);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainResourceCreate(domain, attribs);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainResourceDestroy(resource);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainNameCategoryA(domain, category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainNameCategoryW(domain, category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRegisterStringA(domain, string);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRegisterStringW(domain, string);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainCreateA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainCreateW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainDestroy(domain);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxInitialize(reserved);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name){
+    nvtxNameCuDeviceA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name){
+    nvtxNameCuDeviceW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name){
+    nvtxNameCuContextA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name){
+    nvtxNameCuContextW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name){
+    nvtxNameCuStreamA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name){
+    nvtxNameCuStreamW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name){
+    nvtxNameCuEventA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name){
+    nvtxNameCuEventW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name){
+    nvtxNameCudaDeviceA_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name){
+    nvtxNameCudaDeviceW_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name){
+    nvtxNameCudaStreamA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name){
+    nvtxNameCudaStreamW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name){
+    nvtxNameCudaEventA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name){
+    nvtxNameCudaEventW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name){
+    nvtxNameClDeviceA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name){
+    nvtxNameClDeviceW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name){
+    nvtxNameClContextA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name){
+    nvtxNameClContextW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name){
+    nvtxNameClCommandQueueA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
+    if (local)
+        local(command_queue, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name){
+    nvtxNameClCommandQueueW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
+    if (local)
+        local(command_queue, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name){
+    nvtxNameClMemObjectA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
+    if (local)
+        local(memobj, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name){
+    nvtxNameClMemObjectW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
+    if (local)
+        local(memobj, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name){
+    nvtxNameClSamplerA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
+    if (local)
+        local(sampler, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name){
+    nvtxNameClSamplerW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
+    if (local)
+        local(sampler, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name){
+    nvtxNameClProgramA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
+    if (local)
+        local(program, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name){
+    nvtxNameClProgramW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
+    if (local)
+        local(program, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name){
+    nvtxNameClEventA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
+    if (local)
+        local(evnt, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name){
+    nvtxNameClEventW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
+    if (local)
+        local(evnt, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs){
+    nvtxDomainSyncUserCreate_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
+    if (local) {
+        return local(domain, attribs);
+    }
+    return (nvtxSyncUser_t)0;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserDestroy_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserAcquireStart_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserAcquireFailed_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserAcquireSuccess_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserReleasing_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops);
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops)
+{
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL;
+}
@@ -0,0 +1,83 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef __NVTX_LINKONCE_H__
+#define __NVTX_LINKONCE_H__
+
+/* This header defines macros to permit making definitions of global variables
+ * and functions in C/C++ header files which may be included multiple times in
+ * a translation unit or linkage unit.  It allows authoring header-only libraries
+ * which can be used by multiple other header-only libraries (either as the same
+ * copy or multiple copies), and does not require any build changes, such as
+ * adding another .c file, linking a static library, or deploying a dynamic
+ * library.  Globals defined with these macros have the property that they have
+ * the same address, pointing to a single instance, for the entire linkage unit.
+ * It is expected but not guaranteed that each linkage unit will have a separate
+ * instance.
+ *
+ * In some situations it is desirable to declare a variable without initializing
+ * it, refer to it in code or other variables' initializers, and then initialize
+ * it later.  Similarly, functions can be prototyped, have their address taken,
+ * and then have their body defined later.  In such cases, use the FWDDECL macros 
+ * when forward-declaring LINKONCE global variables without initializers and
+ * function prototypes, and then use the DEFINE macros when later defining them.
+ * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
+ * following this pattern makes code maximally portable.
+ */
+
+#if defined(__MINGW32__) /* MinGW */
+    #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
+    #endif
+#elif defined(_MSC_VER) /* MSVC */
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   extern "C" __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION __inline
+    #endif
+#elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */
+    #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
+    #endif
+#elif defined(__CYGWIN__) /* Assume GCC or compatible */
+    #define NVTX_LINKONCE_WEAK __attribute__((weak))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
+    #endif
+#else /* All others: Assume GCC, clang, or compatible */
+    #define NVTX_LINKONCE_WEAK   __attribute__((weak))
+    #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden")))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
+    #endif
+#endif
+
+#define NVTX_LINKONCE_FWDDECL_GLOBAL   NVTX_LINKONCE_DEFINE_GLOBAL   extern
+#define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION
+
+#endif /* __NVTX_LINKONCE_H__ */
@@ -0,0 +1,304 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+/* This header defines types which are used by the internal implementation
+*  of NVTX and callback subscribers.  API clients do not use these types,
+*  so they are defined here instead of in nvToolsExt.h to clarify they are
+*  not part of the NVTX client API. */
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h.
+#endif
+
+/* ------ Dependency-free types binary-compatible with real types ------- */
+
+/* In order to avoid having the NVTX core API headers depend on non-NVTX
+*  headers like cuda.h, NVTX defines binary-compatible types to use for
+*  safely making the initialization versions of all NVTX functions without
+*  needing to have definitions for the real types. */
+
+typedef int   nvtx_CUdevice;
+typedef void* nvtx_CUcontext;
+typedef void* nvtx_CUstream;
+typedef void* nvtx_CUevent;
+
+typedef void* nvtx_cudaStream_t;
+typedef void* nvtx_cudaEvent_t;
+
+typedef void* nvtx_cl_platform_id;
+typedef void* nvtx_cl_device_id;
+typedef void* nvtx_cl_context;
+typedef void* nvtx_cl_command_queue;
+typedef void* nvtx_cl_mem;
+typedef void* nvtx_cl_program;
+typedef void* nvtx_cl_kernel;
+typedef void* nvtx_cl_event;
+typedef void* nvtx_cl_sampler;
+
+typedef struct nvtxSyncUser* nvtxSyncUser_t;
+struct nvtxSyncUserAttributes_v0;
+typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
+
+/* --------- Types for function pointers (with fake API types) ---------- */
+
+typedef void (NVTX_API * nvtxMarkEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
+typedef void (NVTX_API * nvtxMarkA_impl_fntype)(const char* message);
+typedef void (NVTX_API * nvtxMarkW_impl_fntype)(const wchar_t* message);
+typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
+typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartA_impl_fntype)(const char* message);
+typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartW_impl_fntype)(const wchar_t* message);
+typedef void (NVTX_API * nvtxRangeEnd_impl_fntype)(nvtxRangeId_t id);
+typedef int (NVTX_API * nvtxRangePushEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
+typedef int (NVTX_API * nvtxRangePushA_impl_fntype)(const char* message);
+typedef int (NVTX_API * nvtxRangePushW_impl_fntype)(const wchar_t* message);
+typedef int (NVTX_API * nvtxRangePop_impl_fntype)(void);
+typedef void (NVTX_API * nvtxNameCategoryA_impl_fntype)(uint32_t category, const char* name);
+typedef void (NVTX_API * nvtxNameCategoryW_impl_fntype)(uint32_t category, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameOsThreadA_impl_fntype)(uint32_t threadId, const char* name);
+typedef void (NVTX_API * nvtxNameOsThreadW_impl_fntype)(uint32_t threadId, const wchar_t* name);
+
+/* Real impl types are defined in nvtxImplCuda_v3.h, where CUDA headers are included */
+typedef void (NVTX_API * nvtxNameCuDeviceA_fakeimpl_fntype)(nvtx_CUdevice device, const char* name);
+typedef void (NVTX_API * nvtxNameCuDeviceW_fakeimpl_fntype)(nvtx_CUdevice device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuContextA_fakeimpl_fntype)(nvtx_CUcontext context, const char* name);
+typedef void (NVTX_API * nvtxNameCuContextW_fakeimpl_fntype)(nvtx_CUcontext context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuStreamA_fakeimpl_fntype)(nvtx_CUstream stream, const char* name);
+typedef void (NVTX_API * nvtxNameCuStreamW_fakeimpl_fntype)(nvtx_CUstream stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuEventA_fakeimpl_fntype)(nvtx_CUevent event, const char* name);
+typedef void (NVTX_API * nvtxNameCuEventW_fakeimpl_fntype)(nvtx_CUevent event, const wchar_t* name);
+
+/* Real impl types are defined in nvtxImplOpenCL_v3.h, where OPENCL headers are included */
+typedef void (NVTX_API * nvtxNameClDeviceA_fakeimpl_fntype)(nvtx_cl_device_id device, const char* name);
+typedef void (NVTX_API * nvtxNameClDeviceW_fakeimpl_fntype)(nvtx_cl_device_id device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClContextA_fakeimpl_fntype)(nvtx_cl_context context, const char* name);
+typedef void (NVTX_API * nvtxNameClContextW_fakeimpl_fntype)(nvtx_cl_context context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueA_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const char* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueW_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClMemObjectA_fakeimpl_fntype)(nvtx_cl_mem memobj, const char* name);
+typedef void (NVTX_API * nvtxNameClMemObjectW_fakeimpl_fntype)(nvtx_cl_mem memobj, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClSamplerA_fakeimpl_fntype)(nvtx_cl_sampler sampler, const char* name);
+typedef void (NVTX_API * nvtxNameClSamplerW_fakeimpl_fntype)(nvtx_cl_sampler sampler, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClProgramA_fakeimpl_fntype)(nvtx_cl_program program, const char* name);
+typedef void (NVTX_API * nvtxNameClProgramW_fakeimpl_fntype)(nvtx_cl_program program, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClEventA_fakeimpl_fntype)(nvtx_cl_event evnt, const char* name);
+typedef void (NVTX_API * nvtxNameClEventW_fakeimpl_fntype)(nvtx_cl_event evnt, const wchar_t* name);
+
+/* Real impl types are defined in nvtxImplCudaRt_v3.h, where CUDART headers are included */
+typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
+typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaStreamA_fakeimpl_fntype)(nvtx_cudaStream_t stream, const char* name);
+typedef void (NVTX_API * nvtxNameCudaStreamW_fakeimpl_fntype)(nvtx_cudaStream_t stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaEventA_fakeimpl_fntype)(nvtx_cudaEvent_t event, const char* name);
+typedef void (NVTX_API * nvtxNameCudaEventW_fakeimpl_fntype)(nvtx_cudaEvent_t event, const wchar_t* name);
+
+typedef void (NVTX_API * nvtxDomainMarkEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+typedef nvtxRangeId_t (NVTX_API * nvtxDomainRangeStartEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+typedef void (NVTX_API * nvtxDomainRangeEnd_impl_fntype)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
+typedef int (NVTX_API * nvtxDomainRangePushEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+typedef int (NVTX_API * nvtxDomainRangePop_impl_fntype)(nvtxDomainHandle_t domain);
+typedef nvtxResourceHandle_t (NVTX_API * nvtxDomainResourceCreate_impl_fntype)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
+typedef void (NVTX_API * nvtxDomainResourceDestroy_impl_fntype)(nvtxResourceHandle_t resource);
+typedef void (NVTX_API * nvtxDomainNameCategoryA_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
+typedef void (NVTX_API * nvtxDomainNameCategoryW_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
+typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringA_impl_fntype)(nvtxDomainHandle_t domain, const char* string);
+typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringW_impl_fntype)(nvtxDomainHandle_t domain, const wchar_t* string);
+typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateA_impl_fntype)(const char* message);
+typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateW_impl_fntype)(const wchar_t* message);
+typedef void (NVTX_API * nvtxDomainDestroy_impl_fntype)(nvtxDomainHandle_t domain);
+typedef void (NVTX_API * nvtxInitialize_impl_fntype)(const void* reserved);
+
+typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
+
+/* ---------------- Types for callback subscription --------------------- */
+
+typedef const void *(NVTX_API * NvtxGetExportTableFunc_t)(uint32_t exportTableId);
+typedef int (NVTX_API * NvtxInitializeInjectionNvtxFunc_t)(NvtxGetExportTableFunc_t exportTable);
+
+typedef enum NvtxCallbackModule
+{
+    NVTX_CB_MODULE_INVALID                 = 0,
+    NVTX_CB_MODULE_CORE                    = 1,
+    NVTX_CB_MODULE_CUDA                    = 2,
+    NVTX_CB_MODULE_OPENCL                  = 3,
+    NVTX_CB_MODULE_CUDART                  = 4,
+    NVTX_CB_MODULE_CORE2                   = 5,
+    NVTX_CB_MODULE_SYNC                    = 6,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CB_MODULE_SIZE,
+    NVTX_CB_MODULE_FORCE_INT               = 0x7fffffff
+} NvtxCallbackModule;
+
+typedef enum NvtxCallbackIdCore
+{
+    NVTX_CBID_CORE_INVALID                 =  0,
+    NVTX_CBID_CORE_MarkEx                  =  1,
+    NVTX_CBID_CORE_MarkA                   =  2,
+    NVTX_CBID_CORE_MarkW                   =  3,
+    NVTX_CBID_CORE_RangeStartEx            =  4,
+    NVTX_CBID_CORE_RangeStartA             =  5,
+    NVTX_CBID_CORE_RangeStartW             =  6,
+    NVTX_CBID_CORE_RangeEnd                =  7,
+    NVTX_CBID_CORE_RangePushEx             =  8,
+    NVTX_CBID_CORE_RangePushA              =  9,
+    NVTX_CBID_CORE_RangePushW              = 10,
+    NVTX_CBID_CORE_RangePop                = 11,
+    NVTX_CBID_CORE_NameCategoryA           = 12,
+    NVTX_CBID_CORE_NameCategoryW           = 13,
+    NVTX_CBID_CORE_NameOsThreadA           = 14,
+    NVTX_CBID_CORE_NameOsThreadW           = 15,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CORE_SIZE,
+    NVTX_CBID_CORE_FORCE_INT = 0x7fffffff
+} NvtxCallbackIdCore;
+
+typedef enum NvtxCallbackIdCore2
+{
+    NVTX_CBID_CORE2_INVALID                 = 0,
+    NVTX_CBID_CORE2_DomainMarkEx            = 1,
+    NVTX_CBID_CORE2_DomainRangeStartEx      = 2,
+    NVTX_CBID_CORE2_DomainRangeEnd          = 3,
+    NVTX_CBID_CORE2_DomainRangePushEx       = 4,
+    NVTX_CBID_CORE2_DomainRangePop          = 5,
+    NVTX_CBID_CORE2_DomainResourceCreate    = 6,
+    NVTX_CBID_CORE2_DomainResourceDestroy   = 7,
+    NVTX_CBID_CORE2_DomainNameCategoryA     = 8,
+    NVTX_CBID_CORE2_DomainNameCategoryW     = 9,
+    NVTX_CBID_CORE2_DomainRegisterStringA   = 10,
+    NVTX_CBID_CORE2_DomainRegisterStringW   = 11,
+    NVTX_CBID_CORE2_DomainCreateA           = 12,
+    NVTX_CBID_CORE2_DomainCreateW           = 13,
+    NVTX_CBID_CORE2_DomainDestroy           = 14,
+    NVTX_CBID_CORE2_Initialize              = 15,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CORE2_SIZE,
+    NVTX_CBID_CORE2_FORCE_INT               = 0x7fffffff
+} NvtxCallbackIdCore2;
+
+typedef enum NvtxCallbackIdCuda
+{
+    NVTX_CBID_CUDA_INVALID                 =  0,
+    NVTX_CBID_CUDA_NameCuDeviceA           =  1,
+    NVTX_CBID_CUDA_NameCuDeviceW           =  2,
+    NVTX_CBID_CUDA_NameCuContextA          =  3,
+    NVTX_CBID_CUDA_NameCuContextW          =  4,
+    NVTX_CBID_CUDA_NameCuStreamA           =  5,
+    NVTX_CBID_CUDA_NameCuStreamW           =  6,
+    NVTX_CBID_CUDA_NameCuEventA            =  7,
+    NVTX_CBID_CUDA_NameCuEventW            =  8,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CUDA_SIZE,
+    NVTX_CBID_CUDA_FORCE_INT               = 0x7fffffff
+} NvtxCallbackIdCuda;
+
+typedef enum NvtxCallbackIdCudaRt
+{
+    NVTX_CBID_CUDART_INVALID               =  0,
+    NVTX_CBID_CUDART_NameCudaDeviceA       =  1,
+    NVTX_CBID_CUDART_NameCudaDeviceW       =  2,
+    NVTX_CBID_CUDART_NameCudaStreamA       =  3,
+    NVTX_CBID_CUDART_NameCudaStreamW       =  4,
+    NVTX_CBID_CUDART_NameCudaEventA        =  5,
+    NVTX_CBID_CUDART_NameCudaEventW        =  6,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CUDART_SIZE,
+    NVTX_CBID_CUDART_FORCE_INT             = 0x7fffffff
+} NvtxCallbackIdCudaRt;
+
+typedef enum NvtxCallbackIdOpenCL
+{
+    NVTX_CBID_OPENCL_INVALID               =  0,
+    NVTX_CBID_OPENCL_NameClDeviceA         =  1,
+    NVTX_CBID_OPENCL_NameClDeviceW         =  2,
+    NVTX_CBID_OPENCL_NameClContextA        =  3,
+    NVTX_CBID_OPENCL_NameClContextW        =  4,
+    NVTX_CBID_OPENCL_NameClCommandQueueA   =  5,
+    NVTX_CBID_OPENCL_NameClCommandQueueW   =  6,
+    NVTX_CBID_OPENCL_NameClMemObjectA      =  7,
+    NVTX_CBID_OPENCL_NameClMemObjectW      =  8,
+    NVTX_CBID_OPENCL_NameClSamplerA        =  9,
+    NVTX_CBID_OPENCL_NameClSamplerW        = 10,
+    NVTX_CBID_OPENCL_NameClProgramA        = 11,
+    NVTX_CBID_OPENCL_NameClProgramW        = 12,
+    NVTX_CBID_OPENCL_NameClEventA          = 13,
+    NVTX_CBID_OPENCL_NameClEventW          = 14,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_OPENCL_SIZE,
+    NVTX_CBID_OPENCL_FORCE_INT             = 0x7fffffff
+} NvtxCallbackIdOpenCL;
+
+typedef enum NvtxCallbackIdSync
+{
+    NVTX_CBID_SYNC_INVALID                      = 0,
+    NVTX_CBID_SYNC_DomainSyncUserCreate         = 1,
+    NVTX_CBID_SYNC_DomainSyncUserDestroy        = 2,
+    NVTX_CBID_SYNC_DomainSyncUserAcquireStart   = 3,
+    NVTX_CBID_SYNC_DomainSyncUserAcquireFailed  = 4,
+    NVTX_CBID_SYNC_DomainSyncUserAcquireSuccess = 5,
+    NVTX_CBID_SYNC_DomainSyncUserReleasing      = 6,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_SYNC_SIZE,
+    NVTX_CBID_SYNC_FORCE_INT                    = 0x7fffffff
+} NvtxCallbackIdSync;
+
+/* IDs for NVTX Export Tables */
+typedef enum NvtxExportTableID
+{
+    NVTX_ETID_INVALID                      = 0,
+    NVTX_ETID_CALLBACKS                    = 1,
+    NVTX_ETID_RESERVED0                    = 2,
+    NVTX_ETID_VERSIONINFO                  = 3,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_ETID_SIZE,
+    NVTX_ETID_FORCE_INT                    = 0x7fffffff
+} NvtxExportTableID;
+
+typedef void (* NvtxFunctionPointer)(void); /* generic uncallable function pointer, must be casted to appropriate function type */
+typedef NvtxFunctionPointer** NvtxFunctionTable; /* double pointer because array(1) of pointers(2) to function pointers */
+
+typedef struct NvtxExportTableCallbacks
+{
+    size_t struct_size;
+
+    /* returns an array of pointer to function pointers*/
+    int (NVTX_API *GetModuleFunctionTable)(
+        NvtxCallbackModule module,
+        NvtxFunctionTable* out_table,
+        unsigned int* out_size);
+} NvtxExportTableCallbacks;
+
+typedef struct NvtxExportTableVersionInfo
+{
+    /* sizeof(NvtxExportTableVersionInfo) */
+    size_t struct_size;
+
+    /* The API version comes from the NVTX library linked to the app.  The
+    * injection library is can use this info to make some assumptions */
+    uint32_t version;
+
+    /* Reserved for alignment, do not use */
+    uint32_t reserved0;
+
+    /* This must be set by tools when attaching to provide applications
+    *  the ability to, in emergency situations, detect problematic tools
+    *  versions and modify the NVTX source to prevent attaching anything
+    *  that causes trouble in the app.  Currently, this value is ignored. */
+    void (NVTX_API *SetInjectionNvtxVersion)(
+        uint32_t version);
+} NvtxExportTableVersionInfo;
+
+
+
+
+
+
+
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVTX_STUB_H_
+#define NCCL_NVTX_STUB_H_
+
+struct nccl_domain{static constexpr char const* name{"NCCL"};};
+
+#define NVTX3_FUNC_RANGE_IN(domain)
+#define nvtxNameOsThreadA(syscall, thread)
+
+#endif
@@ -10,23 +10,34 @@
 #define NCCL_P2P_H_

 struct ncclP2Pinfo {
- const void* sendbuff;
-  void* recvbuff;
-  ssize_t sendbytes;
-  ssize_t recvbytes;
-};
-
-struct ncclP2PConnect {
-  int nrecv[MAXCHANNELS];
-  int nsend[MAXCHANNELS];
-  int* recv;
-  int* send;
+  void* buff;
+  ssize_t nbytes;
+  struct ncclP2Pinfo* next;
 };

 struct ncclP2Plist {
-  struct ncclP2Pinfo *peerlist;
-  int count;
-  struct ncclP2PConnect connect;
+  struct ncclP2Pinfo *head;
+  struct ncclP2Pinfo *tail;
 };

+static ncclResult_t enqueueP2pInfo(ncclP2Plist* p2p, void* buff, ssize_t nBytes) {
+  if (p2p == NULL) return ncclInternalError;
+  struct ncclP2Pinfo* next;
+  NCCLCHECK(ncclCalloc(&next, 1));
+  next->buff = buff;
+  next->nbytes = nBytes;
+  if (p2p->tail != NULL) p2p->tail->next = next;
+  p2p->tail = next;
+  if (p2p->head == NULL) p2p->head = next;
+  return ncclSuccess;
+}
+
+static ncclResult_t dequeueP2pInfo(ncclP2Plist* p2p) {
+  if (p2p == NULL) return ncclInternalError;
+  struct ncclP2Pinfo* temp = p2p->head;
+  p2p->head = p2p->head->next;
+  if (p2p->tail == temp) p2p->tail = NULL;
+  free(temp);
+  return ncclSuccess;
+}
 #endif
@@ -31,10 +31,11 @@ static void setEnvFile(const char* fileName) {
    int s=0; // Env Var Size
    while (line[s] != '\0' && line[s] != '=') s++;
    if (line[s] == '\0') continue;
-    strncpy(envVar, line, std::min(1024,s));
+    strncpy(envVar, line, std::min(1023,s));
    envVar[s] = '\0';
    s++;
-    strncpy(envValue, line+s, 1024);
+    strncpy(envValue, line+s, 1023);
+    envValue[1023]='\0';
    setenv(envVar, envValue, 0);
  }
  if (line) free(line);
@@ -18,18 +18,23 @@ struct ncclProxyArgs {
  proxyProgressFunc_t progress;
  struct ncclChannel* channel;
  struct ncclConnector* connector;
+  size_t sendbytes;
+  size_t recvbytes;
  int sliceSteps;
  int chunkSteps;
  int nsteps;
  uint64_t opCount;
  int protocol;
+  int segment; // Only for profiling
  ncclDataType_t dtype;
  ncclRedOp_t redOp;
  int state;   // add component before this line -- it is left out during initialization

  // Internal state
-  uint64_t head;
-  uint64_t tail;
+  uint64_t posted;
+  uint64_t received; // Only used by recv proxy to wait for flush.
+  uint64_t transmitted;
+  uint64_t done;
  uint64_t end;
  void* requests[NCCL_STEPS];
  int idle;
@@ -38,14 +43,30 @@ struct ncclProxyArgs {
  pthread_mutex_t mutex;
  struct ncclProxyArgs* next;
  struct ncclProxyArgs* nextPeer;
+  struct ncclProxyArgs* nextGroup;
+  struct ncclProxyArgs** proxyAppendPtr;
+};
+
+struct ncclProxySharedBuffers {
+  int nslots;
+  int slotSize;
+  char* cudaBuff[2*MAXCHANNELS];
+  int* cudaUsed[2*MAXCHANNELS];
+  char* hostBuff[2*MAXCHANNELS];
+  int* hostUsed[2*MAXCHANNELS];
+  struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
 };

 struct ncclProxyPool;
 struct ncclProxyState {
  pthread_cond_t cond;
-  pthread_mutex_t mutex;
+  pthread_mutex_t opsMutex;
+  pthread_mutex_t poolMutex;
  bool stop;
+  struct ncclProxySharedBuffers* sharedBuffs;
  struct ncclProxyArgs* ops;
+  struct ncclProxyArgs* nextOps;
+  struct ncclProxyArgs* nextOpsEnd;
  struct ncclProxyArgs* pool;
  struct ncclProxyPool* pools;
 };
@@ -59,12 +80,16 @@ enum proxyMode {
 };

 ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
-ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel);
-ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info);
+ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm);

+ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
+ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
+ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
+ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
+
 #include <unistd.h>

 // Spin wait until func evaluates to true
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -21,6 +21,7 @@
 #define SLEEP_INT            1000 // connection retry sleep interval in usec
 #define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
 #define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
+#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)

 /* Common socket address storage structure for IPv4/IPv6 */
 union socketAddress {
@@ -64,7 +65,7 @@ static inline int envSocketFamily(void) {

 static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
 #ifdef ENABLE_TRACE
-  char line[1024];
+  char line[SOCKET_NAME_MAXLEN+1];
 #endif
  struct netIf userIfs[MAX_IFS];
  bool searchNot = prefixList && prefixList[0] == '^';
@@ -167,9 +168,9 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {

 static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
 #ifdef ENABLE_TRACE
-  char line[1024];
+  char line[SOCKET_NAME_MAXLEN+1];
 #endif
-  char line_a[1024];
+  char line_a[SOCKET_NAME_MAXLEN+1];
  int found = 0;
  struct ifaddrs *interfaces, *interface;
  getifaddrs(&interfaces);
@@ -355,7 +356,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
  SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");

 #ifdef ENABLE_TRACE
-  char line[1024];
+  char line[SOCKET_NAME_MAXLEN+1];
  TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
 #endif

@@ -370,6 +371,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
 static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
  /* IPv4/IPv6 support */
  int family = remoteAddr->sa.sa_family;
+  if (family != AF_INET && family != AF_INET6) {
+    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
+    return ncclInternalError;
+  }
  int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);

  /* Connect to a hostname / port */
@@ -386,10 +391,8 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
    SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
    SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/

-  char line[1024];
-#ifdef ENABLE_TRACE
+  char line[SOCKET_NAME_MAXLEN+1];
  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
-#endif

  int ret;
  int timedout_retries = 0;
@@ -450,7 +453,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
  return ncclSuccess;
 }

-static ncclResult_t socketReceive(int fd, void* ptr, int size) {
+static ncclResult_t socketRecv(int fd, void* ptr, int size) {
  int offset = 0;
  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, ptr, size, &offset));
  return ncclSuccess;
@@ -41,8 +41,8 @@ struct ncclConnect {
 };

 struct ncclTransportComm {
-  ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
-  ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
+  ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
+  ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
  ncclResult_t (*free)(void*);
  ncclResult_t (*proxy)(struct ncclProxyArgs*);
 };
@@ -54,6 +54,7 @@ struct ncclTransport {
  struct ncclTransportComm recv;
 };

-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);

 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,7 +7,7 @@
 #ifndef NCCL_TREES_H_
 #define NCCL_TREES_H_

-ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
-ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
+ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);

 #endif
@@ -41,7 +41,7 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
 #endif

-const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "Gather", "Scatter", "AllToAll", "AllToAllv" };
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };

@@ -160,47 +160,67 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
 #ifdef ENABLE_COLLTRACE
 void *ncclCommThreadMain(void *arg) {
  ncclComm_t comm = (ncclComm_t)arg;
+  int head = comm->hostDevComm.collTraceHead;
  do {
    int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS;
-    int head = comm->hostDevComm.collTraceHead;
    int count;
    if (head <= tail)
      count = tail - head;
    else
      count = COLLTRACE_NUM_ITEMS + head - tail;
-    usleep(1000); //sleep 1ms
+    if (!count) {
+      if(LOAD(&comm->hostDevComm.collTraceExit))
+        break;
+      else {
+        usleep(1000); //sleep 1ms
+        continue;
+      }
+    }
    for (int i = 0; i < count; i++) {
+      uint8_t type = LOAD(&(comm->hostDevComm.collTrace[head].type));
+      if (type == ncclCollTraceNotReady)
+        break;
      char line[1024];
      int offset = 0;
      #define VEGA_GPU_RTC_FREQUENCY 2.5E7
-      sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
-        (double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount);
-      offset = strlen(line);
-      switch (comm->hostDevComm.collTrace[head].type) {
-        case ncclCollTraceKernelLaunchType:
-          sprintf(line+offset, " KL hwid %8x funcIndex %d",
-            comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex);
-          break;
-        case ncclCollTraceCollEndType:
-          if (comm->hostDevComm.collTrace[head].funcIndex != -1)
-            sprintf(line+offset, " CE next funcIndex %d",
-              comm->hostDevComm.collTrace[head].funcIndex);
-          else
-            sprintf(line+offset, " KE");
-          break;
-        case ncclCollTraceAbortType:
-          sprintf(line+offset, " Abort");
-          break;
-        default:
-          sprintf(line+offset, " unknown collective trace data type");
-          break;
+      if (type == ncclCollTraceDataType) {
+        sprintf(line, "## [%12.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
+          (double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid,
+          comm->hostDevComm.collTrace[head].funcIndex,
+          comm->hostDevComm.collTrace[head].data_0,
+          comm->hostDevComm.collTrace[head].opCount,
+          comm->hostDevComm.collTrace[head].data_1);
+      } else {
+        sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
+          (double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount);
+        offset = strlen(line);
+        switch (type) {
+          case ncclCollTraceKernelLaunchType:
+            sprintf(line+offset, " KL hwid %8x funcIndex %d",
+              comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex);
+            break;
+          case ncclCollTraceCollEndType:
+            if (comm->hostDevComm.collTrace[head].funcIndex != -1)
+              sprintf(line+offset, " CE next funcIndex %d",
+                comm->hostDevComm.collTrace[head].funcIndex);
+            else
+              sprintf(line+offset, " KE");
+            break;
+          case ncclCollTraceAbortType:
+            sprintf(line+offset, " Abort");
+            break;
+          default:
+            sprintf(line+offset, " unknown collective trace data type");
+            break;
+        }
      }
      INFO(NCCL_COLL, "%s", line);
+      STORE(&(comm->hostDevComm.collTrace[head].type), ncclCollTraceNotReady);
      head ++;
      head %= COLLTRACE_NUM_ITEMS;
    }
-    comm->hostDevComm.collTraceHead = tail;
-  } while(!LOAD(&comm->hostDevComm.collTraceExit));
+  } while(1);
+  comm->hostDevComm.collTraceHead = head;
  pthread_exit(NULL);
 }
 #endif
@@ -210,9 +230,11 @@ void *ncclCommThreadMain(void *arg) {
 static ncclResult_t commFree(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;
-  free(comm->p2plist.peerlist);
-  free(comm->p2plist.connect.recv);
-  free(comm->p2plist.connect.send);
+  free(comm->connectSend);
+  free(comm->connectRecv);
+  free(comm->p2pSends);
+  free(comm->p2pRecvs);
+  free(comm->asyncOps);

 #ifdef ENABLE_PROFILING
  struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
@@ -290,7 +312,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
    free(comm->intraCGMode);
    free(comm->intraCC);
  }
-  CUDACHECK(hipHostFree((void *)comm->abortFlag));
+  NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));

  // Poison comm to try and catch a double free
  commPoison(comm);
@@ -319,7 +341,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  struct ncclComm* comm;
  NCCLCHECK(ncclCalloc(&comm, 1));

-  comm->rank = comm->hostDevComm.rank =rank;
+  comm->rank = comm->hostDevComm.rank = rank;
  comm->nRanks = comm->hostDevComm.nRanks = ndev;
  hipGetDevice(&comm->cudaDev);
  NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
@@ -355,17 +377,25 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
    comm->hostDevComm.collTraceThread = 0;
 #endif
  comm->collNetSupport = 0;
-  comm->p2plist.count=0;
-  NCCLCHECK(ncclCalloc(&comm->p2plist.peerlist, comm->nRanks));
-  for (int r=0; r<comm->nRanks; r++) comm->p2plist.peerlist[r].sendbytes = comm->p2plist.peerlist[r].recvbytes = -1;
-  NCCLCHECK(ncclCalloc(&comm->p2plist.connect.recv, MAXCHANNELS*comm->nRanks));
-  NCCLCHECK(ncclCalloc(&comm->p2plist.connect.send, MAXCHANNELS*comm->nRanks));
+
+  NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS));
+  comm->asyncOpCount = 0;
+  comm->asyncTotalSize = 0;
+
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
+  NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
+
+  comm->p2pSendCount = comm->p2pRecvCount = 0;
+  NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));

  // Mark channels as non initialized.
  for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;

-  comm->alltoallDisable = false;
-  if (rcclParamAllToAllDisable()) comm->alltoallDisable = true;
+  comm->alltoallDisable = true;
+  //if (rcclParamAllToAllDisable() == 0) comm->alltoallDisable = false;

  *comret = comm;
  return ncclSuccess;
@@ -373,11 +403,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {

 static ncclResult_t devCommSetup(ncclComm_t comm) {
  // Duplicate the channels on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, std::max(comm->nChannels, comm->p2pnChannels)));
-  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, std::max(comm->nChannels, comm->p2pnChannels)));
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));

  // Copy userRanks and peers
-  for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
+  for (int r=0; r<comm->p2pnChannels; r++) {
    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
  }

@@ -449,7 +479,7 @@ void* waitForNonNullPtr(void* p) {

 ncclResult_t initParams(struct ncclComm* comm) {
  hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
-  params->args =(void **)&comm->argsptr;
+  params->args = (void **)&comm->argsptr;
  params->stream = NULL;
  params->sharedMem = 0;
  params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
@@ -518,8 +548,8 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct

 #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
 #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
-#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
-#define DEFAULT_BUFFSIZE_ARM (1LL << 20) /* 1MiB */
+#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
+#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */
 NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
 NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
 NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
@@ -532,10 +562,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
  int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };

  if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
-  if (comm->nRanks >= 32) {
-    defaults[NCCL_PROTO_SIMPLE] = 524288;
-    INFO(NCCL_INIT, "Setting DEFAULT_BUFFSIZE to %d for nRanks %d", defaults[NCCL_PROTO_SIMPLE], comm->nRanks);
-  }

  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    comm->buffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
@@ -581,7 +607,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
  // setup
  struct ncclConnect myConnect;
  if (isMaster && ret > 0) {
-    NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
+    NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
  }
  // prepare connect handles
  ncclResult_t res;
@@ -611,7 +637,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
  }
  // connect
  if (isMaster && ret > 0) {
-    NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
+    NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
    struct ncclPeer* devRoot = channel->devPeers+nranks;
    struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
    CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
@@ -669,10 +695,9 @@ NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
 NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);

 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
-  // We use 3 AllGathers
-  // 1. { peerInfo, comm }
-  // 2. ConnectTransport[nranks], ConnectValue[nranks]
-  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+  // We use 2 AllGathers
+  // 1. { peerInfo, comm, compCap}
+  // 2. { nChannels, graphInfo, topoRanks }

  int rank = comm->rank;
  int nranks = comm->nRanks;
@@ -684,10 +709,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  struct {
    struct ncclPeerInfo peerInfo;
    struct ncclComm* comm;
+    int cudaCompCap;
  } *allGather1Data;

  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
  allGather1Data[rank].comm = comm;
+  allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
  struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
  NCCLCHECK(fillInfo(comm, myInfo, commHash));
  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
@@ -700,7 +727,40 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
      return ncclInvalidUsage;
    }
  }
-  // AllGather1 data is used again below
+
+  // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
+  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+  int myCompCap = allGather1Data[rank].cudaCompCap;
+  int minCompCap = myCompCap, maxCompCap = myCompCap;
+  uint64_t otherHostHash;
+  int tmpNnodes = 1;
+  for (int i = 0; i < nranks; i++) {
+    if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
+      if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
+        if (intraRanks == 0) intraRank0 = i;
+        if (i == rank) intraRank = intraRanks;
+        intraRanks++;
+      }
+    } else {  // Determine whether number of nodes is 2 (for use in tree pattern determination)
+      if (tmpNnodes == 1) {
+        otherHostHash = allGather1Data[i].peerInfo.hostHash;
+        tmpNnodes = 2;
+      } else if (tmpNnodes == 2 && otherHostHash != allGather1Data[i].peerInfo.hostHash) {
+        tmpNnodes = 3;
+      }
+    }
+    minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
+    maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
+  }
+  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
+    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+         rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+    return ncclInternalError;
+  }
+  struct ncclComm* intraRank0Comm = allGather1Data[intraRank0].comm;
+
  // AllGather1 - end

  // Topo detection / System graph creation
@@ -729,7 +789,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm

  struct ncclTopoGraph treeGraph;
  treeGraph.id = 1;
-  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+  treeGraph.pattern = tmpNnodes <= 2 ? NCCL_TOPO_PATTERN_TREE : NCCL_TOPO_PATTERN_BALANCED_TREE;
  treeGraph.crossNic = ncclParamCrossNic();
  treeGraph.collNet = 0;
  treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
@@ -753,10 +813,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm

  // AllGather3 - begin
  struct ncclGraphInfo {
+    int pattern;
    int sameChannels;
    float speedIntra;
    float speedInter;
    int typeIntra;
+    int typeInter;
  };

  struct {
@@ -776,29 +838,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
  allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
  allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
+  allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
+
  allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
    std::min(treeGraph.nChannels, ringGraph.nChannels);
-  allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
+  allGather3Data[rank].tree.pattern = treeGraph.pattern;
  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
  allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
  allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
  allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
+  allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
+  allGather3Data[rank].ring.pattern = ringGraph.pattern;
  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
  allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
  allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
  allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
+  allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
+  allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
  allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
  allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
  allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
  allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
+  allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;

  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));

  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));

  // Determine nNodes, firstRanks, ...
-  int* nodesFirstRank;
+  int *nodesFirstRank, *nodesTreePatterns;
  NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
+  NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
  for (int i=0; i<nranks; i++) {
    int node = -1;
    int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
@@ -808,18 +878,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    if (node == -1) {
      node = comm->nNodes++;
      nodesFirstRank[node] = firstRank;
+      // Record tree pattern of each node as they can be different depending on sm arch
+      nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
    }
    if (i == comm->rank) comm->node = node;
  }

-  // Determine the minimum CUDA Compute capability of all GPUs
-  int myCompCap = allGather3Data[rank].cudaCompCap;
-  int minCompCap = myCompCap, maxCompCap = myCompCap;
-  for (int i = 0; i < nranks; i++) {
-    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
-    maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
-  }
-
  int nChannelsOrig = comm->nChannels;
  struct ncclTopoRanks** allTopoRanks;
  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
@@ -835,15 +899,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
    treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
    treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
+    treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
    ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
    ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
    ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+    ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
    collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
    collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
    collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
    collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
+    collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
  }
+
  if (comm->alltoallDisable != alltoallDisable) {
    comm->alltoallDisable = alltoallDisable;
  }
@@ -873,7 +941,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

-  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, gcn, nNets));
  if (comm->nNodes > 1 &&
      ncclParamCollNetEnable() == 1 &&
      collNetSupport() && collNetGraph.nChannels) {
@@ -881,23 +949,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  }

  free(allTopoRanks);
+  free(nodesTreePatterns);
  free(nodesFirstRank);
+  free(allGather1Data);
  free(allGather3Data);

  // AllGather3 - end

  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);

-  NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
-
  char line[1024];
  line[0]='\0';
  for (int c=0; c<comm->nChannels; c++) {
-    struct ncclTree* treeUp = &comm->channels[c].treeUp;
-    struct ncclTree* treeDn = &comm->channels[c].treeDn;
-    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
-        c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
-        treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
+    struct ncclTree* tree = &comm->channels[c].tree;
+    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+        c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
    INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
  }
  line[1023] = '\0';
@@ -913,16 +979,24 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(computeBuffSizes(comm));

  // Connect with prev/next for each ring
-  struct ncclConnect *connect;
-  NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
    if (comm->nRanks == 1) continue;
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
  }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore);
+  INFO(NCCL_INIT, "Connected all rings");
+
+  // Connect Trees
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    if (comm->nRanks == 1) continue;
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore);
+  }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore);
+  INFO(NCCL_INIT, "Connected all trees");

  // Check if we can setup CollNet
  if (comm->nNodes > 1 &&
@@ -935,8 +1009,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    for (int c=0; c<logicChannels; c++) {
      struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
      struct ncclChannel* channelSend = comm->channels+c;
-      NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
-      NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
+      NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down));
+      NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up));
      const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
      const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
      if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
@@ -944,82 +1018,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
      else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
        collNetSetupFail = 1;
    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph));
    // Verify CollNet setup across ranks
    NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
  }
  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
-  free(connect);
  free(rings);

+  // Compute time models for algorithm and protocol combinations
+  NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+
  // Compute nChannels per peer for p2p
  NCCLCHECK(ncclTopoComputeP2pChannels(comm));

-  if (!alltoallDisable) {
-    int nc = comm->nChannels;
-    if (comm->topo->type == RCCL_TOPO_4P2H_ROME)
-      nc = 2;
-    for (int c=0; c<nc; c++) {
-      const int peersPerChan = DIVUP(nranks, nc);
-      struct ncclP2PConnect* connect = &comm->p2plist.connect;
-      connect->nrecv[c] = 0;
-      connect->nsend[c] = 0;
-      for (int p=0; p<peersPerChan; p++) {
-        // first channel is reserved for self copy
-        if ((c*peersPerChan+p)%nranks == 0)
-          continue;
-        int peerSend = (rank+c*peersPerChan+p)%nranks;
-        int peerRecv = (2*nranks+rank-(c*peersPerChan)%nranks-p)%nranks;
-        if (comm->channels[c].peers[peerSend].send.connected == 0) {
-          connect->send[c*nranks+connect->nsend[c]++] = peerSend;
-        }
-        if (comm->channels[c].peers[peerRecv].recv.connected == 0) {
-          connect->recv[c*nranks+connect->nrecv[c]++] = peerRecv;
-        }
-      }
-    }
-
-    for (int c=0; c<nc; c++) {
-      struct ncclChannel* channel = comm->channels+c;
-      struct ncclP2PConnect* connect = &comm->p2plist.connect;
-#if 0
-      printf("channel %d recv: ", c);
-      for (int i=0; i<connect->nrecv[c]; i++)
-        printf("%d ", connect->recv[c*nranks+i]);
-      printf("\n");
-      printf("channel %d send: ", c);
-      for (int i=0; i<connect->nsend[c]; i++)
-        printf("%d ", connect->send[c*nranks+i]);
-      printf("\n");
-#endif
-      NCCLCHECK(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*nranks, connect->nsend[c], connect->send+c*nranks));
-      connect->nrecv[c] = 0;
-      connect->nsend[c] = 0;
-    }
-  }
-
-  // Compute intra ranks (using AllGather1 data)
-  do {
-    int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-    for (int i = 0; i < nranks; i++) {
-      if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
-          (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
-        if (intraRanks == 0) intraRank0 = i;
-        if (i == rank) intraRank = intraRanks;
-        intraRanks++;
-      }
-    }
-    TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-    if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
-      WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-          rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-      return ncclInternalError;
-    }
-    NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
-  } while(0);
-
-  // Done with AllGather1 data
-  free(allGather1Data);
+  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));

  if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));

@@ -1083,6 +1095,7 @@ end:

 NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
 ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  int cudaDev;
  CUDACHECK(hipGetDevice(&cudaDev));
  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev));
@@ -1091,6 +1104,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm

 NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
 ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
  if (ndev < 0) {
    WARN("Invalid device count requested : %d", ndev);
@@ -1110,9 +1124,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {

 static ncclResult_t commDestroy(ncclComm_t comm) {
  int savedDevice;
-#ifdef ENABLE_TRACE
-  int rank = comm->rank;
-#endif
  CUDACHECK(hipGetDevice(&savedDevice));
  int commDevice = comm->cudaDev;

@@ -1120,7 +1131,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
    CUDACHECK(hipSetDevice(commDevice));
  }

-  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError);
+  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, LOAD(comm->abortFlag), comm->fatalError);

  CUDACHECK(hipStreamSynchronize(comm->groupStream));
  NCCLCHECK(ncclProxyDestroy(comm));
@@ -1129,13 +1140,14 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
  if (savedDevice != commDevice)
    CUDACHECK(hipSetDevice(savedDevice));

-  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);

  return ncclSuccess;
 }

 NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
 ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  if (comm == NULL)
    return ncclSuccess;

@@ -1152,6 +1164,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {

 NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
 ncclResult_t ncclCommAbort(ncclComm_t comm) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  if (comm == NULL)
    return ncclSuccess;

@@ -1186,6 +1199,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {

 NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
  *count = comm->nRanks;
@@ -1194,6 +1208,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {

 NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
 ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
  *devid = comm->cudaDev;
@@ -1202,6 +1217,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {

 NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
 ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
  *rank = comm->rank;
@@ -46,26 +46,19 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
  }
  // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
  info->nBytes = info->count * ncclTypeSize(info->datatype);
-  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast
-    || info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAll) {
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
    info->count = info->nBytes;
    info->datatype = ncclInt8;
  }
-  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter
-    || info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAll)
-    info->nBytes *= info->comm->nRanks; // count is per rank
-  if (info->coll == ncclCollAllToAllv) {
-    // Use count to store data type size for alltoallv
-    info->count = ncclTypeSize(info->datatype);
-    info->datatype = ncclInt8;
-  }
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+
  if (info->op < 0 || info->op >= ncclNumOps) {
    WARN("%s : invalid reduction operation %d", info->opName, info->op);
    return ncclInvalidArgument;
  }

  if (info->comm->checkPointers) {
-    if (info->coll == ncclCollSendRecv) {
+    if (info->coll == ncclFuncSendRecv) {
      if (strcmp(info->opName, "Send") == 0) {
        NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
      } else {
@@ -73,10 +66,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
      }
    } else {
      // Check CUDA device pointers
-      if ((info->coll != ncclCollBroadcast && info->coll != ncclCollScatter) || info->comm->rank == info->root) {
+      if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
      }
-      if ((info->coll != ncclCollReduce && info->coll != ncclCollGather) || info->comm->rank == info->root) {
+      if (info->coll != ncclFuncReduce || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
      }
    }
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -16,14 +16,11 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
 static nvmlReturn_t (*nvmlInternalShutdown)(void);
 static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
 static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
 static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
    nvmlNvLinkCapability_t capability, unsigned int *capResult);
-static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
 static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);

 // Used to make the NVML library calls thread safe
@@ -74,10 +71,7 @@ ncclResult_t wrapNvmlSymbols(void) {
  LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
  LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
  LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -91,9 +85,6 @@ teardown:
  nvmlInternalShutdown = NULL;
  nvmlInternalDeviceGetHandleByPciBusId = NULL;
  nvmlInternalDeviceGetIndex = NULL;
-  nvmlInternalDeviceGetHandleByIndex = NULL;
-  nvmlInternalDeviceGetPciInfo = NULL;
-  nvmlInternalDeviceGetMinorNumber = NULL;
  nvmlInternalDeviceGetNvLinkState = NULL;
  nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
  nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -162,51 +153,6 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
  return ncclSuccess;
 }

-ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
-  if (nvmlInternalDeviceGetHandleByIndex == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetHandleByIndex() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
-  if (nvmlInternalDeviceGetPciInfo == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetPciInfo() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
-  if (nvmlInternalDeviceGetMinorNumber == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetMinorNumber() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
  if (nvmlInternalDeviceGetNvLinkState == NULL) {
    /* Do not warn, this symbol is optional. */
@@ -6,10 +6,10 @@

 #include "comm.h"
 #include "info.h"
+#include "graph.h"
 #include "collectives.h"

-#define RECV 0
-#define SEND 1
+enum { proxyRecv=0, proxySend=1 };

 static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -19,15 +19,13 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
  const int myrank = 0, nextrank = 1, prevrank = nranks-1;
  int index = pattern == ncclPatternPipelineFrom ?
      /*                            no recv /  no send    if root = */
-      /* bcast  */ (type == RECV ?   myrank : nextrank ):
-      /* reduce */ (type == RECV ? prevrank :   myrank );
+      /* bcast  */ (type == proxyRecv ?   myrank : nextrank ):
+      /* reduce */ (type == proxyRecv ? prevrank :   myrank );
  int rank = ring->userRanks[index];
  return (root != rank);
 }

-enum { proxyRecv=0, proxySend=1 };
-
-#define PROXYARGS_ALLOCATE_SIZE 32
+#define PROXYARGS_ALLOCATE_SIZE 128
 struct ncclProxyPool {
  struct ncclProxyPool *next;
  struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
@@ -36,7 +34,7 @@ struct ncclProxyPool {
 static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
  struct ncclProxyState* state = &comm->proxyState;
  struct ncclProxyArgs* elem;
-  pthread_mutex_lock(&state->mutex);
+  pthread_mutex_lock(&state->poolMutex);
  if (state->pool == NULL) {
    // Allocate a new pool of elements
    struct ncclProxyPool* newPool;
@@ -54,39 +52,113 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a
  }
  elem = state->pool;
  state->pool = state->pool->next;
-  pthread_mutex_unlock(&state->mutex);
-  elem->next = elem->nextPeer = NULL;
+  pthread_mutex_unlock(&state->poolMutex);
+  elem->next = elem->nextPeer = elem->nextGroup = NULL;
  *argsptr = elem;
  return ncclSuccess;
 }

-static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
-  struct ncclComm* comm = connector->comm;
-  struct ncclProxyState* state = &comm->proxyState;
-  pthread_mutex_lock(&state->mutex);
-  if (connector->proxyAppend == NULL) {
-    // Nothing running for that peer. Add to the circular list
-    if (state->ops == NULL) {
-      // Create the list
-      args->next = args;
-      state->ops = args;
-    } else {
-      // Insert element in the list
-      args->next = state->ops->next;
-      state->ops->next = args;
+//#define DEBUG_PROXY 1
+#ifdef DEBUG_PROXY
+#define DEBUG_PROXY_PRINT printf
+#else
+#define DEBUG_PROXY_PRINT(...)
+#endif
+
+#define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1)
+#define OP_SEEN 0x100000
+ncclResult_t dumpProxyState(struct ncclProxyState* state) {
+#ifdef DEBUG_PROXY
+  struct ncclProxyArgs* op = state->ops;
+  while (op) {
+    if (op->idle & OP_SEEN) {
+      WARN("Active list loop at element %ld\n", OP_INDEX(op));
+    }
+    op->idle |= OP_SEEN;
+    printf("[%ld]", OP_INDEX(op));
+    if (op->nextPeer) {
+      printf("(%ld)", OP_INDEX(op->nextPeer));
+      struct ncclProxyArgs* n = op->nextPeer;
+      n->idle |= OP_SEEN;
+      while (n->nextGroup || n->nextPeer) {
+        n = n->nextGroup ? n->nextGroup : n->nextPeer;
+        n->idle |= OP_SEEN;
+      }
+    }
+    if (op->nextGroup)  {
+      printf("--G->");
+      op = op->nextGroup;
+    } else {
+      printf("--N->");
+      op = op->next;
    }
-    connector->proxyAppend = args;
-  } else {
-    // There is an active operation already for that peer.
-    // Add it to the per-peer list
-    connector->proxyAppend->nextPeer = args;
-    connector->proxyAppend = args;
  }
-  pthread_mutex_unlock(&state->mutex);
+  printf("[X]\n");
+
+  struct ncclProxyArgs* free = state->pool;
+  while (free) {
+    if (free->idle & OP_SEEN) {
+      WARN("Free list loop at element %ld\n", OP_INDEX(free));
+    }
+    free->idle |= OP_SEEN;
+    free = free->next;
+  }
+
+  struct ncclProxyPool* p = state->pools;
+  int i = 0;
+  while (p) {
+    for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
+      if ((p->elems[e].idle & OP_SEEN) == 0) {
+        WARN("Element %d of pool %d has been lost\n", e, i);
+        struct ncclProxyArgs* free = state->pool;
+        printf("Free list ");
+        while (free) {
+          printf("--> %ld ", OP_INDEX(free));
+          free = free->next;
+        }
+        printf("\n");
+        return ncclInternalError;
+      }
+      p->elems[e].idle -= OP_SEEN;
+    }
+    p = p->next;
+    i++;
+  }
+#endif
+  return ncclSuccess;
 }

-template <int type>
-static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args, int shared) {
+  struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr;
+  if (proxyAppend) {
+    if (shared && proxyAppend->opCount == args->opCount) {
+      args->next = proxyAppend->next;
+      proxyAppend->next = NULL;
+      proxyAppend->nextGroup = args;
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld/%5ld) as group, prevGroup %5ld, next %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend), OP_INDEX(args->next));
+    } else {
+      proxyAppend->nextPeer = args;
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld/%5ld) as nextPeer of %5ld                  : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+    }
+  } else {
+    // Nothing running for that peer. Add to the list
+    if (state->ops == NULL) {
+      // Create the list
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld) as first element                            : \n", OP_INDEX(args), shared, args->opCount);
+      state->ops = args;
+    } else {
+      // Append element at the end of the list
+      struct ncclProxyArgs* last = state->ops;
+      while (last->nextGroup || last->next) last = last->nextGroup ? last->nextGroup : last->next;
+      last->next = args;
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld) as last element                             : \n", OP_INDEX(args),shared, args->opCount);
+    }
+  }
+  *(args->proxyAppendPtr) = args;
+  return ncclSuccess;
+}
+
+static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
  if (peer < 0) return ncclSuccess;

  struct ncclPeer* peerComm = args->channel->peers+peer;
@@ -98,107 +170,168 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
  }
  if (connector->transportComm->proxy == NULL) return ncclSuccess;

+  struct ncclProxyState* state = &connector->comm->proxyState;
  struct ncclProxyArgs* op;
  NCCLCHECK(allocateArgs(connector->comm, &op));
  memcpy(op, args, sizeof(struct ncclProxyArgs));
  op->connector = connector;
  op->progress = connector->transportComm->proxy;
  op->state = ncclProxyOpReady;
-  ProxyAppend(connector, op);
+
+  op->proxyAppendPtr =
+    connector->conn.shared ?
+    state->sharedBuffs->proxyAppend+2*args->channel->id+type : // Shared buffers
+    &connector->proxyAppend;  // Dedicated buffers
+
+  if (state->nextOps == NULL) state->nextOps = op;
+  else state->nextOpsEnd->next = op;
+  state->nextOpsEnd = op;
  return ncclSuccess;
 }

 ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
    struct ncclRing* ring = &args->channel->ring;
-    if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
-    if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+    if (NeedProxy(proxyRecv, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args));
+    if (NeedProxy(proxySend, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args));
  }
  if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
    // Tree up
-    struct ncclTree* tree = &args->channel->treeUp;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
-    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args));
+    NCCLCHECK(SaveProxy(proxySend, tree->up, args));
  }
  if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
    // Tree down
-    struct ncclTree* tree = &args->channel->treeDn;
-    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
-    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args));
+    NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
  }
  if (pattern == ncclPatternCollTreeUp) {
    // CollTree up
-    struct ncclTree* tree = &args->channel->collTreeUp;
-    NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
-    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+    struct ncclTree* tree = &args->channel->collTree;
+    NCCLCHECK(SaveProxy(proxyRecv, tree->down[0], args));
+    NCCLCHECK(SaveProxy(proxySend, tree->up, args));
  }
  if (pattern == ncclPatternCollTreeDown) {
    // CollTree down
-    struct ncclTree* tree = &args->channel->collTreeDn;
-    NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
-    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+    struct ncclTree* tree = &args->channel->collTree;
+    NCCLCHECK(SaveProxy(proxySend, tree->down[0], args));
+    NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
  }
  return ncclSuccess;
 }

-ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel) {
+ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment) {
  struct ncclProxyArgs args;
  memset(&args, 0, sizeof(struct ncclProxyArgs));
  args.channel = channel;
  args.sliceSteps = 1;
  args.chunkSteps = 1;
  args.protocol = NCCL_PROTO_SIMPLE;
-  args.opCount = info->comm->opCount;
+  args.segment = segment;
+  args.opCount = channel->workFifoTail-1;
  args.dtype = info->datatype;
-  if (info->delta > 0 && info->sendbytes >= 0) {
-    int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
-    args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
-    if (args.nsteps == 0) args.nsteps = 1;
-    NCCLCHECK(SaveProxy<proxySend>(peersend, &args));
-  }
  if (info->delta > 0 && info->recvbytes >= 0) {
    int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
    args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
    if (args.nsteps == 0) args.nsteps = 1;
-    NCCLCHECK(SaveProxy<proxyRecv>(peerrecv, &args));
+    args.recvbytes = info->recvbytes;
+    args.sendbytes = 0;
+    NCCLCHECK(SaveProxy(proxyRecv, peerrecv, &args));
+  }
+  if (info->delta > 0 && info->sendbytes >= 0) {
+    int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
+    args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
+    if (args.nsteps == 0) args.nsteps = 1;
+    args.sendbytes = info->sendbytes;
+    args.recvbytes = 0;
+    NCCLCHECK(SaveProxy(proxySend, peersend, &args));
  }
  return ncclSuccess;
 }

-ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info) {
-  const int peersPerChan = DIVUP(info->comm->nRanks, info->nChannels);
-  const int chunkSize  = info->comm->buffSizes[info->protocol]/NCCL_STEPS*info->chunkSteps;
-  const int loopSize = (info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1)*info->nchunksPerLoop*chunkSize;
-  for (int p=0; p<peersPerChan; p++) {
-    if ((peersPerChan == 1 && args->channel->id >= (info->nChannels/info->comm->nRanks)*info->comm->nRanks) ||
-      (peersPerChan > 1 && args->channel->id*peersPerChan+p >= info->comm->nRanks))
-      continue;
-    // first channel is reserved for self copy
-    if ((args->channel->id*peersPerChan+p)%info->comm->nRanks == 0)
-      continue;
-    int peerSend = (info->comm->rank+(args->channel->id*peersPerChan)+p)%info->comm->nRanks;
-    int peerRecv = (2*info->comm->nRanks+info->comm->rank-(args->channel->id*peersPerChan)%info->comm->nRanks-p%info->comm->nRanks)%info->comm->nRanks;
-    if (info->coll == ncclCollAllToAll || (info->coll == ncclCollScatter && info->comm->rank == info->root) ||
-      (info->coll == ncclCollGather && peerSend == info->root))
-      NCCLCHECK(SaveProxy<proxySend>(peerSend, args));
-    if (info->coll == ncclCollAllToAll || (info->coll == ncclCollGather && info->comm->rank == info->root) ||
-      (info->coll == ncclCollScatter && peerRecv == info->root))
-      NCCLCHECK(SaveProxy<proxyRecv>(peerRecv, args));
-    if (info->coll == ncclCollAllToAllv) {
-      info->nBytes = info->sendcounts[peerSend]*info->count;
-      int nLoops = (int)(DIVUP(info->nBytes, loopSize));
-      args->nsteps = info->nstepsPerLoop*nLoops*info->chunkSteps;
-      TRACE(NCCL_NET,"peerSend %d opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
-          peerSend, args->opCount, args->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
-          nLoops, args->nsteps, info->comm);
-      NCCLCHECK(SaveProxy<proxySend>(peerSend, args));
-      info->nBytes = info->recvcounts[peerRecv]*info->count;
-      nLoops = (int)(DIVUP(info->nBytes, loopSize));
-      args->nsteps = info->nstepsPerLoop*nLoops*info->chunkSteps;
-      TRACE(NCCL_NET,"peerRecv %d opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
-          peerRecv, args->opCount, args->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
-          nLoops, args->nsteps, info->comm);
-      NCCLCHECK(SaveProxy<proxyRecv>(peerRecv, args));
+static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr, struct ncclProxyArgs** prevGroupPtr) {
+  struct ncclProxyArgs* freeOp = *opPtr;
+  DEBUG_PROXY_PRINT("Remove %ld/%ld -> %ld -> %ld/%ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(*prevGroupPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next), OP_INDEX(freeOp->nextGroup));
+  if (*prevGroupPtr && *prevOpPtr) return ncclInternalError;
+  if (freeOp->nextGroup) {
+    // Part of a group : remove the element
+    struct ncclProxyArgs* next = freeOp->nextGroup;
+    *opPtr = next;
+    if (*prevGroupPtr) {
+      (*prevGroupPtr)->nextGroup = next;
+    } else if (*prevOpPtr) {
+      (*prevOpPtr)->next = next;
+    } else {
+      state->ops = next;
+    }
+  } else {
+    struct ncclProxyArgs* next = freeOp->next;
+    *opPtr = next;
+    if ((*prevGroupPtr)) {
+      (*prevGroupPtr)->next = next;
+      (*prevGroupPtr)->nextGroup = NULL;
+      (*prevGroupPtr)->nextPeer = freeOp->nextPeer;
+      if (*(freeOp->proxyAppendPtr) == freeOp) *(freeOp->proxyAppendPtr) = *prevGroupPtr;
+      (*prevOpPtr) = *prevGroupPtr;
+      (*prevGroupPtr) = NULL;
+    } else {
+      if (freeOp->nextPeer) {
+        // replace op by nextPeer
+        struct ncclProxyArgs* nextPeer = freeOp->nextPeer;
+        if (*prevOpPtr) {
+          (*prevOpPtr)->next = nextPeer;
+        } else {
+          state->ops = nextPeer;
+        }
+        struct ncclProxyArgs* lastGroup = nextPeer;
+        while (lastGroup->nextGroup) lastGroup = lastGroup->nextGroup;
+        lastGroup->next = next;
+        *(prevOpPtr) = lastGroup;
+      } else {
+        *(freeOp->proxyAppendPtr) = NULL;
+        if (*prevOpPtr) {
+          (*prevOpPtr)->next = next;
+        } else {
+          state->ops = next;
+        }
+      }
+    }
+  }
+  pthread_mutex_lock(&state->poolMutex);
+  freeOp->next = state->pool;
+  state->pool = freeOp;
+  pthread_mutex_unlock(&state->poolMutex);
+  DEBUG_PROXY_PRINT("Removed %5ld (%5ld)                                               : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+  NCCLCHECK(dumpProxyState(state));
+  return ncclSuccess;
+}
+
+static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
+  struct ncclProxyArgs* prevOp = NULL;
+  struct ncclProxyArgs* prevGroup = NULL;
+  struct ncclProxyArgs* op = *opsPtr;
+  while (op) {
+    if (op->state == ncclProxyOpNone) return ncclInternalError;
+    // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
+    // yet and might be cancelled before they even start. Hold on on those.
+    if (op->opCount < comm->lastOpCount) {
+      NCCLCHECK(op->progress(op));
+      *idle &= op->idle;
+    }
+    if (op->state == ncclProxyOpNone) {
+      NCCLCHECK(removeOp(state, &op, &prevOp, &prevGroup));
+    } else {
+      if (op->nextGroup) {
+        prevGroup = op;
+        prevOp = NULL;
+        op = op->nextGroup;
+      } else {
+        prevOp = op;
+        prevGroup = NULL;
+        op = op->next;
+      }
    }
  }
  return ncclSuccess;
@@ -207,91 +340,170 @@ ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info)
 void* persistentThread(void *comm_) {
  struct ncclComm* comm = (struct ncclComm*)comm_;
  struct ncclProxyState* state = &comm->proxyState;
-  struct ncclProxyArgs* op = NULL;
-  ncclResult_t ret = ncclSuccess;
-  int idle = 1;
-  int idleSpin = 0;
+  char threadName[16];
+  sprintf(threadName, "NCCLproxy %5d", comm->rank);
+  nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
+
+  pthread_mutex_lock(&state->opsMutex);
+  struct ncclProxyArgs** opsPtr = &state->ops;
  while (1) {
-    do {
-      if (*comm->abortFlag) return NULL;
-      if (op == NULL) {
-        pthread_mutex_lock(&state->mutex);
-        op = state->ops;
-        if (op == NULL) {
-          if (state->stop) {
-            // No more commands to process and proxy has been requested to stop
-            pthread_mutex_unlock(&state->mutex);
-            return NULL;
-          }
-          pthread_cond_wait(&state->cond, &state->mutex);
-        }
-        pthread_mutex_unlock(&state->mutex);
+    if (LOAD(comm->abortFlag)) {
+      pthread_mutex_unlock(&state->opsMutex);
+      return NULL;
+    }
+
+    while (LOAD(opsPtr) == NULL) {
+      if (state->stop) {
+        // No more commands to process and proxy has been requested to stop
+        pthread_mutex_unlock(&state->opsMutex);
+        return NULL;
      }
-    } while (op == NULL);
-    op->idle = 0;
-    // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
-    // yet and might be cancelled before they even start. Hold on on those.
-    if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
+      pthread_cond_wait(&state->cond, &state->opsMutex);
+    }
+    int idle = 1;
+    ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
    if (ret != ncclSuccess) {
      comm->fatalError = ret;
      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+      pthread_mutex_unlock(&state->opsMutex);
      return NULL;
    }
-    idle &= op->idle;
-    pthread_mutex_lock(&state->mutex);
-    if (!idle) idleSpin = 0;
-    struct ncclProxyArgs *next = op->next;
-    if (next->state == ncclProxyOpNone) {
-      struct ncclProxyArgs *freeOp = next;
-      if (next->nextPeer) {
-        // Replace next by its next per-peer element.
-        next = next->nextPeer;
-        if (op != freeOp) {
-          next->next = freeOp->next;
-          op->next = next;
-        } else {
-          next->next = next;
-        }
-      } else {
-        // Remove next from circular list
-        next->connector->proxyAppend = NULL;
-        if (op != freeOp) {
-          next = next->next;
-          op->next = next;
-        } else {
-          next = NULL;
-        }
-      }
-      if (freeOp == state->ops) state->ops = next;
-      freeOp->next = state->pool;
-      state->pool = freeOp;
+    if (idle) {
+      pthread_mutex_unlock(&state->opsMutex);
+      sched_yield(); // No request progressed. Let others run.
+      pthread_mutex_lock(&state->opsMutex);
    }
-    op = next;
-    if (op == state->ops) {
-      if (idle == 1) {
-        if (++idleSpin == 10) {
-          sched_yield();
-          idleSpin = 0;
-        }
-      }
-      idle = 1;
-    }
-    pthread_mutex_unlock(&state->mutex);
  }
 }

 ncclResult_t ncclProxyStart(struct ncclComm* comm) {
-  pthread_mutex_lock(&comm->proxyState.mutex);
-  if (comm->proxyState.ops != NULL)
-    pthread_cond_signal(&comm->proxyState.cond);
-  pthread_mutex_unlock(&comm->proxyState.mutex);
+  struct ncclProxyState* state = &comm->proxyState;
+  pthread_mutex_lock(&state->opsMutex);
+
+  // Sort operations as we append them : collectives and
+  // receives first, then sends.
+  ncclProxyArgs* next, *prev = NULL, *op = state->nextOps;
+  while (op) {
+    next = op->next;
+    if (op->sendbytes) {
+      if (prev) prev->next = next;
+      else state->nextOps = next;
+      op->next = NULL;
+      NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
+    } else prev = op;
+    op = next;
+  }
+  op = state->nextOps;
+  while (op) {
+    next = op->next;
+    op->next = NULL;
+    NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
+    op = next;
+  }
+  state->nextOps = state->nextOpsEnd = NULL;
+  NCCLCHECK(dumpProxyState(state));
+
+  if (state->ops != NULL)
+    pthread_cond_signal(&state->cond);
+  pthread_mutex_unlock(&state->opsMutex);
+  return ncclSuccess;
+}
+
+NCCL_PARAM(ProxySharedBuffersCount, "SHARED_BUFF_COUNT", -2);
+
+ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  if (state == NULL) {
+    NCCLCHECK(ncclCalloc(&state, 1));
+    comm->proxyState.sharedBuffs = state;
+    state->nslots = ncclParamProxySharedBuffersCount();
+    if (state->nslots == -2)  {
+      state->nslots = NCCL_STEPS*NCCL_MAX_WORK_ELEMENTS;
+    }
+    state->slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
+  }
+
+  char* buff;
+  int* used;
+  *size = 2*comm->p2pnChannels*state->slotSize*state->nslots;
+
+  if (cuda && state->cudaBuff[0] == NULL) {
+    NCCLCHECK(ncclCudaCalloc(&buff, *size, cuda));
+    NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
+    for (int i=0; i<2*comm->p2pnChannels; i++) {
+      state->cudaBuff[i] = buff + state->nslots*state->slotSize*i;
+      state->cudaUsed[i] = used + state->nslots*i;
+    }
+  } else if (state->hostBuff[0] == NULL) {
+    NCCLCHECK(ncclCudaHostCalloc(&buff, *size));
+    NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
+    for (int i=0; i<2*comm->p2pnChannels; i++) {
+      state->hostBuff[i] = buff + state->nslots*state->slotSize*i;
+      state->hostUsed[i] = used + state->nslots*i;
+    }
+  }
+  buff = cuda ? state->cudaBuff[0] : state->hostBuff[0];
+
+  *ptr = buff;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  // Use different pools for different channels and also separate send/recv.
+  int p = 2*channel+type;
+  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
+  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
+  if (buff == NULL) return ncclInternalError;
+  int nslots = 1;
+  while (nslots*state->slotSize < size) nslots *= 2;
+  for (int s=0; s<state->nslots; s+=nslots) {
+    int u = 0;
+    for (int i=0; i<nslots; i++) u += used[s+i];
+    if (u == 0) {
+      for (int i=0; i<nslots; i++) used[s+i] = 1;
+      *ptr = buff+state->slotSize*s;
+      return ncclSuccess;
+    }
+  }
+  *ptr = NULL;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  int p = 2*channel+type;
+  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
+  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
+  if (buff == NULL) return ncclInternalError;
+  int nslots = 1;
+  while (nslots*state->slotSize < size) nslots *= 2;
+  int s = (ptr-buff)/state->slotSize;
+  if (s < 0 || s+nslots > state->nslots) {
+    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
+    return ncclInternalError;
+  }
+  for (int i=0; i<nslots; i++) used[s+i] = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  if (state) {
+    CUDACHECK(hipFree(state->cudaBuff[0]));
+    free(state->cudaUsed[0]);
+    NCCLCHECK(ncclCudaHostFree(state->hostBuff[0]));
+    free(state->hostUsed[0]);
+    free(state);
+  }
  return ncclSuccess;
 }

 ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
  if (!comm->proxyThread) {
    comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
-    comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+    comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER;
+    comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER;
    comm->proxyState.ops = NULL;
    pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
  }
@@ -302,21 +514,23 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
  struct ncclProxyState* state = &comm->proxyState;

  // Request the proxy to stop and then wake it
-  pthread_mutex_lock(&state->mutex);
+  pthread_mutex_lock(&state->opsMutex);
  state->stop = true;
  pthread_cond_signal(&state->cond);
-  pthread_mutex_unlock(&state->mutex);
+  pthread_mutex_unlock(&state->opsMutex);
  if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);

  // Free off any memory allocated for the proxy arg pools
-  pthread_mutex_lock(&state->mutex);
+  pthread_mutex_lock(&state->poolMutex);
  struct ncclProxyState* proxyState = &comm->proxyState;
  while (proxyState->pools != NULL) {
    struct ncclProxyPool *next = proxyState->pools->next;
    free(proxyState->pools);
    proxyState->pools = next;
  }
-  pthread_mutex_unlock(&state->mutex);
+  pthread_mutex_unlock(&state->poolMutex);
+
+  NCCLCHECK(ncclProxySharedBuffersDestroy(comm));

  return ncclSuccess;
 }
@@ -20,15 +20,15 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
 };

 template <int type>
-static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
+static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
  for (int t=0; t<NTRANSPORTS; t++) {
    struct ncclTransport *transport = ncclTransports+t;
    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
    int ret = 0;
-    NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
+    NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
    if (ret) {
      connector->transportComm = transportComm;
-      NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId));
+      NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId));
      return ncclSuccess;
    }
  }
@@ -36,51 +36,86 @@ static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopo
  return ncclInternalError;
 }

-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
-  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
-  struct ncclConnect connect;
-  struct ncclConnector* conn;
+  uint32_t mask = 1 << channel->id;
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].recv;
-    if (conn->connected) { ++nSkippedRecv; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
-    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue;
+    comm->connectRecv[peer] |= mask;
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].send;
-    if (conn->connected) { ++nSkippedSend; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
-    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue;
+    comm->connectSend[peer] |= mask;
+  }
+  return ncclSuccess;
+}
+
+void dumpData(struct ncclConnect* data, int ndata) {
+  for (int n=0; n<ndata; n++) {
+    printf("[%d] ", n);
+    uint8_t* d = (uint8_t*)data;
+    for (int i=0; i<sizeof(struct ncclConnect); i++) printf("%02x", d[i]);
+    printf("\n");
+  }
+}
+
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
+  struct ncclConnect data[2*MAXCHANNELS];
+  for (int i=1; i<comm->nRanks; i++) {
+    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+    int sendPeer = (comm->rank + i) % comm->nRanks;
+    uint32_t recvMask = comm->connectRecv[recvPeer];
+    uint32_t sendMask = comm->connectSend[sendPeer];
+
+    struct ncclConnect* recvData = data;
+    int sendChannels = 0, recvChannels = 0;
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (recvMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
+        NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
+      }
+    }
+    struct ncclConnect* sendData = recvData+recvChannels;
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (sendMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
+        NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
+      }
+    }
+
+    if (sendPeer == recvPeer) {
+      if (recvChannels+sendChannels) {
+         NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+         NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+         sendData = data;
+         recvData = data+sendChannels;
+      }
+    } else {
+      if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
+      if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
+      if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
+      if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
+    }
+
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (sendMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
+        NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
+        conn->connected = 1;
+        CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
+      }
+    }
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (recvMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
+        NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
+        conn->connected = 1;
+        CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
+      }
+    }
+    comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
  }
-  for (int i=0; i<nsend; i++) {
-    int peer = peerSend[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].send;
-    if (conn->connected) {++nSkippedSend; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
-    conn->connected = 1;
-    CUDACHECK(hipMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
-  }
-  for (int i=0; i<nrecv; i++) {
-    int peer = peerRecv[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].recv;
-    if (conn->connected) {++nSkippedRecv; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
-    conn->connected = 1;
-    CUDACHECK(hipMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
-  }
-  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
  return ncclSuccess;
 }
@@ -27,10 +27,8 @@ struct reqSlot {

 struct collNetSendResources {
  void* collNetSendComm;
-  struct ncclSendMem* hostSendMem;
-  struct ncclRecvMem* hostRecvMem;
-  struct ncclSendMem* devHostSendMem;
-  struct ncclRecvMem* devHostRecvMem;
+  struct ncclSendMem* sendMem;
+  struct ncclRecvMem* recvMem;
  uint32_t* llData;
  int netDev;
  int useGdr;
@@ -46,10 +44,8 @@ struct collNetSendResources {
 struct collNetRecvResources {
  void* netListenComm;
  void* collNetRecvComm;
-  struct ncclSendMem* hostSendMem;
-  struct ncclRecvMem* hostRecvMem;
-  struct ncclSendMem* devHostSendMem;
-  struct ncclRecvMem* devHostRecvMem;
+  struct ncclSendMem* sendMem;
+  struct ncclRecvMem* recvMem;
  uint32_t* llData;
  int netDev;
  int useGdr;
@@ -68,16 +64,15 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
 }

 /* Setup send connector, and return connect information for others in the coll communicator to connect to me */
-ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
  struct collNetSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;

-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));

-  NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
-  resources->devHostSendMem = resources->hostSendMem;
+  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));

  int recvSize = offsetof(struct ncclRecvMem, buff);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p];
@@ -85,8 +80,7 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
  if (resources->useGdr) {
    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
  }
-  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
-  resources->devHostRecvMem = resources->hostRecvMem;
+  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
  NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2));

  INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
@@ -95,16 +89,15 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
 }

 /* Setup recv connector */
-ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
+ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
  struct collNetRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;

-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));

-  NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
-  resources->devHostSendMem = resources->hostSendMem;
+  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));

  int recvSize = offsetof(struct ncclRecvMem, buff);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p];
@@ -112,8 +105,7 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
  if (resources->useGdr) {
    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
  }
-  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
-  resources->devHostRecvMem = resources->hostRecvMem;
+  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));

  NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2));

@@ -124,25 +116,25 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
  return ncclSuccess;
 }

-ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
  struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);

  // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
-  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
  int offset = 0;
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
+    send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
    offset += send->comm->buffSizes[p];
  }
  send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;

  // Head/Tail/Opcount/Fifos are always on host
-  send->conn.tail = &resources->devHostRecvMem->tail;
-  send->conn.fifo = resources->devHostRecvMem->sizesFifo;
-  send->conn.head = &resources->devHostSendMem->head;
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+  send->conn.tail = &resources->recvMem->tail;
+  send->conn.sizesFifo = resources->recvMem->sizesFifo;
+  send->conn.head = &resources->sendMem->head;
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;

  // Get info from recv side
  resources->collNetRank = rank;
@@ -160,24 +152,24 @@ ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, in
  return ncclSuccess;
 }

-ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
  struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
  resources->collNetRank = rank;

  // Intermediate buffering on GPU for GPU Direct RDMA
-  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
  int offset = 0;
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
+    recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
    offset += recv->comm->buffSizes[p];
  }
  recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;

  // Head/Tail/Opcount are always on host
-  recv->conn.tail = &resources->devHostRecvMem->tail;
-  recv->conn.head = &resources->devHostSendMem->head;
+  recv->conn.tail = &resources->recvMem->tail;
+  recv->conn.head = &resources->sendMem->head;

  // Connect to coll comm
  collNetHandle_t** handlePtrs = NULL;
@@ -214,8 +206,8 @@ cleanup:

 ncclResult_t collNetSendFree(void* sendTransportResources) {
  struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
+  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
  if (resources->collNetSendComm) {
    NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL]));
    NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
@@ -229,12 +221,12 @@ ncclResult_t collNetSendFree(void* sendTransportResources) {

 ncclResult_t collNetRecvFree(void* recvTransportResources) {
  struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
  if (resources->collNetRecvComm) {
    NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL]));
    NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
  }
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
  if (resources->useGdr)
    CUDACHECK(hipFree(resources->devRecvMem));
  free(resources->llData);
@@ -257,96 +249,84 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    // Round to next multiple of sliceSteps
    resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
    args->state = ncclProxyOpProgress;
  }
+  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
    char* localBuff = args->connector->conn.buffs[p];
    void* sendMhandle = resources->sendMhandles[p];
    void* recvMhandle = resources->recvMhandles[p];
-    args->idle = 1;
    struct reqSlot* reqFifo = resources->reqFifo;
-    if (args->head < args->end) {
-      int buffSlot = args->tail%NCCL_STEPS;
-      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS
-          && reqFifo[buffSlot].recvBuff != NULL) {
-        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
-        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+    int buffSlot = args->transmitted%NCCL_STEPS;
+    if (args->transmitted < args->end && args->transmitted < args->done + NCCL_STEPS
+        && LOAD(&reqFifo[buffSlot].recvBuff) != NULL) {
+      volatile int* sizesFifo = resources->recvMem->sizesFifo;
+      volatile uint64_t* recvTail = &resources->recvMem->tail;
+      if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
+        // We have something to receive, let's check if it's completely ready.
+        int size = LOAD(sizesFifo+buffSlot);
+        char* buff = localBuff+buffSlot*stepSize;
+        int ready = 1;
        if (args->protocol == NCCL_PROTO_LL) {
-          int size = LOAD(sizesFifo+buffSlot);
-          if (size != -1) {
-            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
-            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
-            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-            int ready = 1;
-            for (int i=0; i<nFifoLines; i++) {
-              volatile uint32_t *f1 = &lines[i].flag1;
-              volatile uint32_t *f2 = &lines[i].flag2;
-              if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
-            }
-            if (ready) {
-              int stepLines = stepSize / sizeof(union ncclLLFifoLine);
-              //separate data from flag
-              uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines;  // each line has two data elements
-              for (int i=0; i<nFifoLines; i++) {
-                volatile uint32_t *d1 = &lines[i].data1;
-                volatile uint32_t *d2 = &lines[i].data2;
-                sendBuff[2*i] = LOAD(d1);
-                sendBuff[2*i+1] = LOAD(d2);
-              }
-              int count = nFifoLines*2*sizeof(uint32_t) / ncclTypeSize(args->dtype);
-              NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
-              if (args->requests[buffSlot] != NULL) {
-                TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
-                STORE(sizesFifo+buffSlot, -1);
-                // Make sure size is reset to zero before we update the head.
-                __sync_synchronize();
-                args->tail += args->sliceSteps;
-                args->idle = 0;
-              }
-            }
-          }
-        } else if (args->tail < LOAD(recvTail)) {
-          // Send through network
-          if (LOAD(sizesFifo+buffSlot) != -1) {
-            int count = LOAD(sizesFifo+buffSlot)/ncclTypeSize(args->dtype);
-            NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localBuff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
-            if (args->requests[buffSlot] != NULL) {
-              TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
-              STORE(sizesFifo+buffSlot, -1);
-              // Make sure size is reset to zero before we update the head.
-              __sync_synchronize();
-              args->tail += args->sliceSteps;
-              args->idle = 0;
-            }
+          uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
+          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
+          // Pack data into another buffer
+          int stepLines = stepSize / sizeof(union ncclLLFifoLine);
+          uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines;  // each line has two data elements
+          buff = (char*)sendBuff;
+          for (int i=0; i<nFifoLines; i++) {
+            volatile uint32_t *f1 = &lines[i].flag1;
+            volatile uint32_t *d1 = &lines[i].data1;
+            volatile uint32_t *f2 = &lines[i].flag2;
+            volatile uint32_t *d2 = &lines[i].data2;
+            if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
+            sendBuff[2*i] = LOAD(d1);
+            sendBuff[2*i+1] = LOAD(d2);
          }
+          size = nFifoLines*2*sizeof(uint32_t);
        }
-      }
-      if (args->head < args->tail) {
-        int done, size;
-        int buffSlot = args->head%NCCL_STEPS;
-        NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
-        if (done) {
-          TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size);
-          reqFifo[buffSlot].size = size;
-          // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
-          // (reordered store after store is possible on POWER, though not on x86)
-          __sync_synchronize();
-          reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
-          args->head += args->sliceSteps;
-          STORE(&resources->hostSendMem->head, args->head);
-          args->idle = 0;
+        if (ready) {
+          // Data is ready, try to send.
+          int count = size/ncclTypeSize(args->dtype);
+          NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
+          if (args->requests[buffSlot] != NULL) {
+            TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
+            STORE(sizesFifo+buffSlot, -1);
+            // Make sure size is reset to zero before we update the head.
+            __sync_synchronize();
+            args->transmitted += args->sliceSteps;
+            args->idle = 0;
+            return ncclSuccess;
+          }
        }
      }
    }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    // Check whether the network has completed some send operations.
+    if (args->done < args->transmitted) {
+      int done, size;
+      int buffSlot = args->done%NCCL_STEPS;
+      NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
+      if (done) {
+        TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
+        STORE(&reqFifo[buffSlot].size, size);
+        // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
+        // (reordered store after store is possible on POWER, though not on x86)
+        __sync_synchronize();
+        STORE(&reqFifo[buffSlot].recvBuff, NULL); // Notify recvProxy
+        args->done += args->sliceSteps;
+        resources->sendMem->head = args->done;
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+        return ncclSuccess;
+      }
    }
  }
  return ncclSuccess;
@@ -361,56 +341,79 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    // Round to next multiple of sliceSteps
    resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->received = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
    args->state = ncclProxyOpProgress;
  }
+  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
-    args->idle = 1;
    int p = args->protocol;
    int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
    char* localBuff = args->connector->conn.buffs[p];
    void* mhandle = resources->mhandles[p];
    struct reqSlot* reqFifo = resources->reqFifo;
-    if (args->head < args->end) {
-      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
-        int buffSlot = args->tail%NCCL_STEPS;
-        char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
-        int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
-        reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize;
-        TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, reqFifo[buffSlot].recvBuff);
-        args->tail += args->sliceSteps;
-        args->idle = 0;
-      }
-      if (args->tail > args->head) {
-        int buffSlot = args->head%NCCL_STEPS;
-        if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
-          TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size);
-          args->head += args->sliceSteps;
-          if (args->protocol == NCCL_PROTO_LL) { // ll
-            // re-attach flag
-            uint32_t flag = args->head;
-            int stepLines = stepSize / sizeof(union ncclLLFifoLine);
-            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-            uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
-            int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t));
-            for (int i=0; i<nFifoLines; i++) {
-              lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
-              lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
-            }
-          } else if (args->protocol == NCCL_PROTO_SIMPLE) {
-            if (resources->useGdr) NCCLCHECK(collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle));
-            resources->hostRecvMem->tail = args->head;
+    if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
+      int buffSlot = args->posted%NCCL_STEPS;
+      char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
+      int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
+      STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize);
+      TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
+      args->posted += args->sliceSteps;
+      args->idle = 0;
+      return ncclSuccess;
+    }
+    if (args->posted > args->received) {
+      int buffSlot = args->received%NCCL_STEPS;
+      if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete
+        TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
+        if (args->protocol == NCCL_PROTO_LL) { // ll
+          // re-attach flag
+          uint32_t flag = NCCL_LL_FLAG(args->received + 1);
+          int stepLines = stepSize / sizeof(union ncclLLFifoLine);
+          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
+          uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
+          int nFifoLines = DIVUP(LOAD(&reqFifo[buffSlot].size), 2*sizeof(uint32_t));
+          for (int i=0; i<nFifoLines; i++) {
+            lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
+            lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
          }
-          args->idle = 0;
        }
+        args->received += args->sliceSteps;
+        if (LOAD(&reqFifo[buffSlot].size) > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
+          NCCLCHECK(collNetIflush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, LOAD(&reqFifo[buffSlot].size), mhandle, args->requests+buffSlot));
+        } else {
+          args->requests[buffSlot] = NULL;
+        }
+        args->idle = 0;
+        return ncclSuccess;
      }
    }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    if (args->received > args->transmitted) {
+      // Progress flush operations
+      int buffSlot = args->transmitted%NCCL_STEPS;
+      int done = 1;
+      if (args->requests[buffSlot]) NCCLCHECK(collNetTest(args->requests[buffSlot], &done, NULL));
+      if (done) {
+        args->transmitted += args->sliceSteps;
+        __sync_synchronize();
+        resources->recvMem->tail = args->transmitted;
+        args->idle = 0;
+        return ncclSuccess;
+      }
+    }
+    if (args->transmitted > args->done) {
+      volatile uint64_t* sendHead = &resources->sendMem->head;
+      uint64_t done = LOAD(sendHead);
+      while (done > args->done &&
+             // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
+             args->transmitted > args->done) {
+        args->done += args->sliceSteps;
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+      }
    }
  }
  return ncclSuccess;
@@ -9,7 +9,7 @@
 #include "net.h"
 #include "graph.h"
 #include <sys/time.h>
-#include <numaif.h>
+#include "collectives.h"

 struct netConnectInfo {
  ncclNetHandle_t netHandle;
@@ -25,6 +25,7 @@ struct netSendResources {
  struct ncclRecvMem* recvMem;
  int netDev;
  int useGdr;
+  int shared;
  char* buffers[LOC_COUNT];
  int buffSizes[LOC_COUNT];
  void* mhandles[LOC_COUNT];
@@ -40,6 +41,7 @@ struct netRecvResources {
  struct ncclRecvMem* recvMem;
  int netDev;
  int useGdr;
+  int shared;
  char* buffers[LOC_COUNT];
  int buffSizes[LOC_COUNT];
  void* mhandles[LOC_COUNT];
@@ -55,118 +57,118 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
  return ncclSuccess;
 }

+NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
+
 /* Determine if we will use this transport for this peer and return connect
 * information for this peer */
-ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
  struct netSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;
+  send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;

-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));

  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
  NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));

  send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
  send->conn.tail = &resources->recvMem->tail;
-  send->conn.fifo = resources->recvMem->sizesFifo;
+  send->conn.sizesFifo = resources->recvMem->sizesFifo;
+  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+  send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
  send->conn.head = &resources->sendMem->head;
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+  resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;

-  int protoLoc[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+  if (resources->shared == 0) {
+    int protoLoc[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    }
+    int buffSizes[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      buffSizes[p] = send->comm->buffSizes[p];
+      resources->buffSizes[protoLoc[p]] += buffSizes[p];
+    }
+
+    if (resources->buffSizes[LOC_DEVMEM]) {
+      NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
+    }
+    if (resources->buffSizes[LOC_HOSTMEM]) {
+      NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+    }
+
+    int offsets[LOC_COUNT];
+    offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
+      send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+      offsets[protoLoc[p]] += buffSizes[p];
+    }
  }

-  int buffSizes[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    // Only allocate buffers for simple for p2p connections
-    buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : send->comm->buffSizes[p];
-    resources->buffSizes[protoLoc[p]] += buffSizes[p];
-  }
-
-  if (resources->buffSizes[LOC_DEVMEM]) {
-    NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
-  }
-  char line[16];
-  if (resources->buffSizes[LOC_HOSTMEM]) {
-    NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
-    int status[1] = {-1};
-    line[0]= 0;
-    if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
-      sprintf(line, "/MEM%d", status[0]);
-  }
-
-  int offsets[LOC_COUNT];
-  offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
-    send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
-    offsets[protoLoc[p]] += buffSizes[p];
-  }
-
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : line);
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
  return ncclSuccess;
 }

-ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
+ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
  struct netRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;
+  recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;

-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));

  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
  NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));

  recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
  recv->conn.tail = &resources->recvMem->tail;
+  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+  recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
  recv->conn.head = &resources->sendMem->head;

-  int protoLoc[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+  if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p
+    int protoLoc[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    }
+
+    int buffSizes[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      buffSizes[p] = recv->comm->buffSizes[p];
+      resources->buffSizes[protoLoc[p]] += buffSizes[p];
+    }
+
+    if (resources->buffSizes[LOC_DEVMEM]) {
+      NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
+    }
+    if (resources->buffSizes[LOC_HOSTMEM]) {
+      NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+    }
+
+    int offsets[LOC_COUNT];
+    offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
+      recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+      offsets[protoLoc[p]] += buffSizes[p];
+    }
  }

-  int buffSizes[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    // Only allocate buffers for simple for p2p connections
-    buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : recv->comm->buffSizes[p];
-    resources->buffSizes[protoLoc[p]] += buffSizes[p];
-  }
-
-  if (resources->buffSizes[LOC_DEVMEM]) {
-    NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
-  }
-  char line[16];
-  if (resources->buffSizes[LOC_HOSTMEM]) {
-    NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
-    int status[1] = {-1};
-    line[0]= 0;
-    if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
-      sprintf(line, "/MEM%d", status[0]);
-  }
-
-  int offsets[LOC_COUNT];
-  offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
-    recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
-    offsets[protoLoc[p]] += buffSizes[p];
-  }
-
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : line);
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
  struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
  NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));

  return ncclSuccess;
 }

-ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct netSendResources* resources = (struct netSendResources*)send->transportResources;
  struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
@@ -174,6 +176,13 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
  // Connect to remote peer
  NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));

+  if (resources->shared) {
+    // Get shared buffers
+    int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
+    resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+  }
+
  if (resources->buffSizes[LOC_DEVMEM]) {
    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
  }
@@ -184,7 +193,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
 }

 /* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;

@@ -192,6 +201,13 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
  NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
  NCCLCHECK(ncclNetCloseListen(resources->netListenComm));

+  if (resources->shared) {
+    // Get shared buffers
+    int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
+    resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+  }
+
  if (resources->buffSizes[LOC_DEVMEM]) {
    NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
  }
@@ -209,8 +225,10 @@ ncclResult_t netSendFree(void* transportResources) {
    if (resources->buffers[l])
      NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
  }
-  NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
-  CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
+  if (resources->shared == 0) {
+    NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+    CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
+  }
  NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
  free(resources);
  return ncclSuccess;
@@ -224,138 +242,144 @@ ncclResult_t netRecvFree(void* transportResources) {
    if (resources->buffers[l])
      NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
  }
-  NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
-  CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
+  if (resources->shared == 0) {
+    NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+    CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
+  }
  NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
  free(resources);
  return ncclSuccess;
 }

+static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
+
 ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
  struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
  if (args->state == ncclProxyOpReady) {
    // Round to next multiple of sliceSteps
    resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
    args->state = ncclProxyOpProgress;
  }
+  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
    char* localBuff = args->connector->conn.buffs[p];
    void* mhandle = *(resources->mhandlesProto[p]);
-    args->idle = 1;
-    if (args->head < args->end) {
-      int buffSlot = args->tail%NCCL_STEPS;
-      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
-        volatile int* sizesFifo = resources->recvMem->sizesFifo;
-        volatile uint64_t* recvTail = &resources->recvMem->tail;
+    int buffSize = stepSize*args->sliceSteps;
+    if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
+    if (args->sendbytes < buffSize) buffSize = args->sendbytes;
+    // Post buffers to the GPU
+    if (args->posted < args->end && args->posted < args->done + NCCL_STEPS) {
+      if (resources->shared) {
+        char* ptr;
+        NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, &ptr));
+        if (ptr == NULL) return ncclInternalError;
+        resources->recvMem->ptrsFifo[args->posted%NCCL_STEPS] = ptr;
+        __sync_synchronize();
+        volatile uint64_t* sendHead = &resources->sendMem->head;
+        args->posted += args->sliceSteps;
+        STORE(sendHead, args->posted - NCCL_STEPS);
+      } else args->posted += args->sliceSteps;
+      args->idle = 0;
+      return ncclSuccess;
+    }
+    // Check whether we received data from the GPU and send it to the network
+    int buffSlot = args->transmitted%NCCL_STEPS;
+    if (args->transmitted < args->posted && args->transmitted < args->done + NCCL_STEPS) {
+      volatile int* sizesFifo = resources->recvMem->sizesFifo;
+      volatile uint64_t* recvTail = &resources->recvMem->tail;
+      if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
+        // We have something to receive, let's check if it's completely ready.
+        int size = LOAD(sizesFifo+buffSlot);
+        char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+        int ready = 1;
        if (args->protocol == NCCL_PROTO_LL128) {
-          if (args->tail < LOAD(recvTail)) {
-            if (LOAD(sizesFifo+buffSlot) != -1) {
-              int ready = resources->useGdr;
-              if (!ready) {
-                // When data is in sysmem, we need to wait until all flags are correct since the GPU only
-                // called threadfence()
-                uint64_t flag = args->tail + 1;
-                int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
-                volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
-                ready = 1;
-                for (int i=0; i<nFifoLines; i++) {
-                  if (LOAD(lines+i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS) != flag) { ready = 0; break; }
-                }
-              }
-              if (ready) {
-                // Send through network
-                NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), mhandle, args->requests+buffSlot));
-                if (args->requests[buffSlot] != NULL) {
-                  STORE(sizesFifo+buffSlot, -1);
-                  // Make sure size is reset to zero before we update the head.
-                  __sync_synchronize();
-                  args->tail += args->sliceSteps;
-                  args->idle = 0;
-                }
-              }
+          int ready = resources->useGdr;
+          if (!ready) {
+            // When data is in sysmem, we need to wait until all flags are correct since the GPU only
+            // called threadfence()
+            uint64_t flag = args->transmitted + 1;
+            int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
+            volatile uint64_t* lines = (volatile uint64_t*)buff;
+            ready = 1;
+            for (int i=0; i<nFifoLines; i++) {
+              if (LOAD(lines+i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS) != flag) { ready = 0; break; }
            }
          }
        } else if (args->protocol == NCCL_PROTO_LL) {
-          int size = LOAD(sizesFifo+buffSlot);
-          if (size != -1) {
-            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
-            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
-            size = nFifoLines * sizeof(union ncclLLFifoLine);
-            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-            int ready = 1;
-            for (int i=0; i<nFifoLines; i++) {
-              volatile uint32_t *f1 = &lines[i].flag1;
-              volatile uint32_t *f2 = &lines[i].flag2;
-              if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
-            }
-            if (ready) {
-              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, mhandle, args->requests+buffSlot));
-              if (args->requests[buffSlot] != NULL) {
-                STORE(sizesFifo+buffSlot, -1);
-                // Make sure size is reset to zero before we update the head.
-                __sync_synchronize();
-                args->tail += args->sliceSteps;
-                args->idle = 0;
-              }
-            }
-          }
-        } else if (args->tail < LOAD(recvTail)) {
-          // Send through network
-          if (LOAD(sizesFifo+buffSlot) != -1) {
-            NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
-            if (args->requests[buffSlot] != NULL) {
-#ifdef ENABLE_PROFILING
-              if (args->channel->active_req == 0) {
-                gettimeofday(&args->channel->tvs, NULL);
-                args->channel->sizes = 0;
-              }
-              args->channel->active_req ++;
-              args->channel->sizes += LOAD(sizesFifo+buffSlot);
-              args->channel->send_byte += LOAD(sizesFifo+buffSlot);
-#endif
-              STORE(sizesFifo+buffSlot, -1);
-              // Make sure size is reset to zero before we update the head.
-              __sync_synchronize();
-              args->tail += args->sliceSteps;
-              args->idle = 0;
-            }
+          uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
+          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
+          for (int i=0; i<nFifoLines; i++) {
+            volatile uint32_t *f1 = &lines[i].flag1;
+            volatile uint32_t *f2 = &lines[i].flag2;
+            if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
          }
        }
-      }
-      if (args->head < args->tail) {
-        int done;
-        int buffSlot = args->head%NCCL_STEPS;
-        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
-        if (done) {
+        if (ready) {
+          // Data is ready, try to send.
+          NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, args->requests+buffSlot));
+          if (args->requests[buffSlot] != NULL) {
 #ifdef ENABLE_PROFILING
-          if (args->protocol == NCCL_PROTO_SIMPLE) {
-            args->channel->active_req --;
            if (args->channel->active_req == 0) {
-              struct timeval tv;
-              gettimeofday(&tv, NULL);
-              float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
-              if (delta) {
-                args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
-                args->channel->bw_count ++;
-              }
+              gettimeofday(&args->channel->tvs, NULL);
+              args->channel->sizes = 0;
            }
-          }
+            args->channel->active_req ++;
+            args->channel->sizes += LOAD(sizesFifo+buffSlot);
+            args->channel->send_byte += LOAD(sizesFifo+buffSlot);
 #endif
-          args->head += args->sliceSteps;
-          STORE(&resources->sendMem->head, args->head);
-          args->idle = 0;
+            TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
+            STORE(sizesFifo+buffSlot, -1);
+            // Make sure size is reset to zero before we update the head.
+            __sync_synchronize();
+            args->transmitted += args->sliceSteps;
+            args->idle = 0;
+            return ncclSuccess;
+          }
        }
      }
    }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    // Check whether the network has completed some send operations.
+    if (args->done < args->transmitted) {
+      int done;
+      int buffSlot = args->done%NCCL_STEPS;
+      NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+      if (done) {
+        TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]);
+#ifdef ENABLE_PROFILING
+        if (args->protocol == NCCL_PROTO_SIMPLE) {
+          args->channel->active_req --;
+          if (args->channel->active_req == 0) {
+            struct timeval tv;
+            gettimeofday(&tv, NULL);
+            float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec;
+            if (delta) {
+              args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3;
+              args->channel->bw_count ++;
+            }
+          }
+        }
+#endif
+        if (resources->shared) {
+          char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
+          NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, ptr));
+        }
+        args->done += args->sliceSteps;
+
+        if (resources->shared == 0) {
+          resources->sendMem->head = args->done;
+        }
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+        return ncclSuccess;
+      }
    }
  }
  return ncclSuccess;
@@ -366,45 +390,57 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    // Round to next multiple of sliceSteps
    resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->received = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
    args->state = ncclProxyOpProgress;
  }
+  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
-    args->idle = 1;
    int p = args->protocol;
    int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
    char* localBuff = args->connector->conn.buffs[p];
    void* mhandle = *(resources->mhandlesProto[p]);
-    if (args->head < args->end) {
-      volatile uint64_t* sendHead = &resources->sendMem->head;
-      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) {
-        int buffSlot = args->tail%NCCL_STEPS;
-        int sliceSize = stepSize * args->sliceSteps;
-        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
-        if (args->requests[buffSlot] != NULL) {
-#ifdef ENABLE_PROFILING
-          if (args->protocol == NCCL_PROTO_SIMPLE) {
-            if (args->channel->active_req == 0) {
-              gettimeofday(&args->channel->tvs, NULL);
-              args->channel->sizes = 0;
-            }
-            args->channel->active_req ++;
-          }
-#endif
-          args->tail += args->sliceSteps;
-          args->idle = 0;
-        }
+    int buffSize = stepSize*args->sliceSteps;
+    if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
+    if (args->recvbytes < buffSize) buffSize = args->recvbytes;
+    if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
+      int buffSlot = args->posted%NCCL_STEPS;
+      char* ptr;
+      if (resources->shared) {
+        NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, &ptr));
+        if (ptr == NULL) return ncclInternalError;
+        volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
+        STORE(ptrsFifo+buffSlot, ptr);
+      } else {
+        ptr = localBuff+buffSlot*stepSize;
      }
-      if (args->tail > args->head) {
-        int buffSlot = args->head%NCCL_STEPS;
-        int done, size;
-        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
-        if (done) {
-          args->head += args->sliceSteps;
-          if (args->protocol == NCCL_PROTO_SIMPLE) {
+      NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
+      if (args->requests[buffSlot] != NULL) {
+        TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
 #ifdef ENABLE_PROFILING
+        if (args->protocol == NCCL_PROTO_SIMPLE) {
+          if (args->channel->active_req == 0) {
+            gettimeofday(&args->channel->tvs, NULL);
+            args->channel->sizes = 0;
+          }
+          args->channel->active_req ++;
+        }
+#endif
+        args->posted += args->sliceSteps;
+        args->idle = 0;
+        return ncclSuccess;
+      } else if (resources->shared) {
+        NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
+      }
+    }
+    if (args->posted > args->received) {
+      int buffSlot = args->received%NCCL_STEPS;
+      int done, size;
+      NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
+      if (done) {
+        args->received += args->sliceSteps;
+#ifdef ENABLE_PROFILING
+        if (args->protocol == NCCL_PROTO_SIMPLE) {
          args->channel->active_req --;
          args->channel->sizes += size;
          args->channel->recv_byte += size;
@@ -417,18 +453,50 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
              args->channel->bw_count ++;
            }
          }
-#endif
-            if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
-            STORE(&resources->recvMem->tail, args->head);
-          }
-          args->idle = 0;
        }
+#endif
+        if (size > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
+          // Don't pass data to the GPU yet, flush first.
+          volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
+          char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
+          NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, args->requests+buffSlot));
+        } else {
+          args->requests[buffSlot] = NULL;
+        }
+        args->idle = 0;
+        return ncclSuccess;
      }
    }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    if (args->received > args->transmitted) {
+      // Progress flush operations
+      int buffSlot = args->transmitted%NCCL_STEPS;
+      int done = 1;
+      if (args->requests[buffSlot]) NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+      if (done) {
+        args->transmitted += args->sliceSteps;
+        __sync_synchronize();
+        resources->recvMem->tail = args->transmitted;
+        args->idle = 0;
+        return ncclSuccess;
+      }
+    }
+    if (args->transmitted > args->done) {
+      volatile uint64_t* sendHead = &resources->sendMem->head;
+      uint64_t done = LOAD(sendHead);
+      while (done > args->done &&
+          // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
+          args->transmitted > args->done) {
+        if (resources->shared) {
+          char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
+          NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
+        }
+        args->done += args->sliceSteps;
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+      }
    }
  }
  return ncclSuccess;
@@ -25,9 +25,8 @@
 #include "ibvwrap.h"

 #define USE_RDMA_WRITE 1
-#define USE_RDMA_SEND_INLINE 0
 #define MAXNAMESIZE 64
-static char ncclIbIfName[MAX_IF_NAME_SIZE];
+static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
 static union socketAddress ncclIbIfAddr;

 static int ncclNIbDevs = -1;
@@ -58,6 +57,8 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
 NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
 NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
 NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
+NCCL_PARAM(IbPkey, "IB_PKEY", 0);
+NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
 NCCL_PARAM(IbSl, "IB_SL", 0);
 NCCL_PARAM(IbTc, "IB_TC", 0);
 NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
@@ -200,7 +201,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
            ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
      }
      line[1023] = '\0';
-      char addrline[1024];
+      char addrline[SOCKET_NAME_MAXLEN+1];
      INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
    }
    pthread_mutex_unlock(&ncclIbLock);
@@ -251,7 +252,7 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
  return ncclSuccess;
 }

-#define MAX_REQUESTS 128
+#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS

 struct ncclIbQpInfo {
  uint32_t lid;
@@ -272,18 +273,19 @@ struct ncclIbHandle {
  union socketAddress connectAddr;
 };

-struct ncclIbVerbs {
-  struct ibv_pd* pd;
-  struct ibv_cq* cq;
-};
-
 struct ncclIbRequest {
  int used;
  int type;
  struct ncclIbVerbs* verbs;
-  int done;
+  int events;
  int size;
-  int free;
+};
+
+struct ncclIbVerbs {
+  struct ibv_pd* pd;
+  struct ibv_cq* cq;
+  uint64_t pad[2];
+  struct ncclIbRequest reqs[MAX_REQUESTS];
 };

 struct ncclIbListenComm {
@@ -297,18 +299,23 @@ struct alignas(64) ncclIbSendFifo {
  uint32_t seq;
  uint32_t rkey;
  uint32_t ready;
+  uint64_t pad[1]; // Pad FIFO element size to be 32-bytes
 };

 struct ncclIbSendComm {
  struct ncclIbVerbs verbs;
  struct ncclIbSendFifo fifo[MAX_REQUESTS];
-  struct ncclIbRequest reqs[MAX_REQUESTS];
  uint32_t fifoHead;
  int fd;
  int ready;
  struct ibv_qp* qp;
  struct ibv_mr* fifoMr;
 };
+// The SendFifo needs to be 32-byte aligned and each element needs
+// to be a 32-byte multiple, so that an entry does not get split and
+// written out of order when IB Relaxed Ordering is enabled
+static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
+static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");

 struct ncclIbGpuFlush {
  int enabled;
@@ -331,16 +338,17 @@ struct ncclIbRemFifo {
 struct ncclIbRecvComm {
  struct ncclIbVerbs verbs;
  struct ncclIbRemFifo remFifo;
-  struct ncclIbRequest reqs[MAX_REQUESTS];
  int fd;
  int ready;
  struct ibv_qp* qp;
  struct ncclIbGpuFlush gpuFlush;
 };
+static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");

 ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
  NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
-  NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, MAX_REQUESTS, NULL, NULL, 0));
+  // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
+  NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS, NULL, NULL, 0));
  return ncclSuccess;
 }

@@ -356,17 +364,17 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
  qpInitAttr.send_cq = verbs->cq;
  qpInitAttr.recv_cq = verbs->cq;
  qpInitAttr.qp_type = IBV_QPT_RC;
-  // We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM)
+  // We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
  qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
  qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
  qpInitAttr.cap.max_send_sge = 1;
  qpInitAttr.cap.max_recv_sge = 1;
-  qpInitAttr.cap.max_inline_data = 0;
+  qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
  NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr));
  struct ibv_qp_attr qpAttr;
  memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
  qpAttr.qp_state = IBV_QPS_INIT;
-  qpAttr.pkey_index = 0;
+  qpAttr.pkey_index = ncclParamIbPkey();
  qpAttr.port_num = ib_port;
  qpAttr.qp_access_flags = access_flags;
  NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
@@ -481,7 +489,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
  socklen_t socklen = sizeof(struct sockaddr_in);
  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
  struct ncclIbQpInfo remQpInfo;
-  NCCLCHECK(socketReceive(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
+  NCCLCHECK(socketRecv(rComm->fd, &remQpInfo, sizeof(remQpInfo)));

  // IB setup
  ibv_context* ctx = ncclIbDevs[lComm->dev].context;
@@ -509,14 +517,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
  NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
  rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
  rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
-
-#if USE_RDMA_SEND_INLINE
-  // Determine whether the remFifo element data can be sent INLINE
-  struct ibv_qp_attr attr;
-  struct ibv_qp_init_attr init_attr;
-  NCCLCHECK(wrap_ibv_query_qp(qp, &attr, IBV_QP_CAP, &init_attr));
-  if (init_attr.cap.max_inline_data >= rComm->remFifo.sge.length) rComm->remFifo.flags = IBV_SEND_INLINE;
-#endif
+  if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;

  // Allocate Flush dummy buffer for GPU Direct RDMA
  rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
@@ -553,16 +554,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
  return ncclSuccess;
 }

-ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** req) {
+ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
  for (int i=0; i<MAX_REQUESTS; i++) {
-    struct ncclIbRequest* r = reqs+i;
+    struct ncclIbRequest* r = verbs->reqs+i;
    if (r->used == 0) {
      r->used = 1;
      r->type = 0;
-      r->verbs = NULL;
-      r->done = 0;
+      r->verbs = verbs;
+      r->events = 1;
      r->size = -1;
-      r->free = 0;
      *req = r;
      return ncclSuccess;
    }
@@ -571,6 +571,10 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
  *req = NULL;
  return ncclInternalError;
 }
+ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
+  r->used = 0;
+  return ncclSuccess;
+}

 ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
  struct ncclIbQpInfo remQpInfo;
@@ -585,7 +589,6 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
  NCCLCHECK(ncclIbRtrQp(qp, &remQpInfo));
  NCCLCHECK(ncclIbRtsQp(qp));
  comm->ready = 1;
-
  // Block until this is done. It *should* not block indefinitely.
  NCCLCHECK(socketSend(comm->fd, &comm->ready, sizeof(int)));

@@ -606,6 +609,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
 #define REG_ALIGN (4096)

 ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
  uint64_t addr = (uint64_t)data;
  assert(size > 0);
@@ -639,8 +643,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
  if (LOAD(readyPtr) == 0) { *request = NULL; return ncclSuccess; }

  struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
+  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
  req->size = size;

  struct ibv_send_wr wr;
@@ -656,14 +659,10 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
    wr.sg_list = &sge;
    wr.num_sge = 1;
  }
+#if USE_RDMA_WRITE == 0
  wr.opcode = IBV_WR_SEND;
  wr.send_flags = IBV_SEND_SIGNALED;
-
-  int useAr = 0;
-  if (size > ncclParamIbArThreshold()) {
-    useAr = 1;
-  }
-#if USE_RDMA_WRITE
+#else
  __sync_synchronize(); // order the readyPtr load against rkey load below
  // Sanity checks to catch user collective call count/size mismatches
  // plus any potential programming errors
@@ -672,7 +671,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
        size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
    return ncclInternalError;
  }
+  int useAr = 0;
+  if (size > ncclParamIbArThreshold()) {
+    useAr = 1;
+  }
  wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.send_flags = useAr ? 0 : IBV_SEND_SIGNALED;
  wr.wr.rdma.remote_addr = LOAD(&slot->addr);
  wr.wr.rdma.rkey = LOAD(&slot->rkey);
  wr.imm_data = size; // Send the message size via imm_data
@@ -696,7 +700,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
    wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
    wr.sg_list = NULL;
    wr.num_sge = 0;
-    wr.send_flags &= ~IBV_SEND_SIGNALED;
+    wr.send_flags |= IBV_SEND_SIGNALED;
    NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
  }
 #endif
@@ -704,28 +708,51 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
  return ncclSuccess;
 }

-ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size) {
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) {
  struct ibv_send_wr wr;
  memset(&wr, 0, sizeof(wr));
-  struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
-  req->free = 1; // Not a user req ; free as soon as it is complete.
-  wr.wr_id = (uint64_t)req;

-  struct ncclIbSendFifo* localElem = comm->remFifo.elems + (comm->remFifo.tail % MAX_REQUESTS);
+  int slot = comm->remFifo.tail%MAX_REQUESTS;
+  struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot;
  localElem->addr = addr;
  localElem->rkey = rkey;
  localElem->ready = 1;
  localElem->size = size; // Sanity/Debugging
  localElem->seq = comm->remFifo.tail; // Sanity/Debugging
-  wr.wr.rdma.remote_addr = comm->remFifo.addr + (comm->remFifo.tail % MAX_REQUESTS) * sizeof(struct ncclIbSendFifo);
+  wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo);
  wr.wr.rdma.rkey = comm->remFifo.rkey;
  comm->remFifo.sge.addr = (uint64_t)localElem;
  wr.sg_list = &comm->remFifo.sge;
  wr.num_sge = 1;
  wr.opcode = IBV_WR_RDMA_WRITE;
-  wr.send_flags = IBV_SEND_SIGNALED | comm->remFifo.flags; // IBV_SEND_INLINE
+  wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE
+
+  // We need to occasionally post a request with the IBV_SEND_SIGNALED flag, otherwise
+  // the send queue will never empty.
+  //
+  // From https://www.rdmamojo.com/2014/06/30/working-unsignaled-completions/
+  // "How to use Unsignaled Completion?" / "Gotchas and Pitfalls"
+  // All posted Send Requested, Signaled and Unsignaled, are considered outstanding until
+  // a Work Completion that they, or Send Requests that were posted after them, was polled
+  // from the Completion Queue associated with the Send Queue. This means if one works with
+  // a Queue Pair that was configured to work with Unsignaled Completions, he must make
+  // sure that occasionally (before the Send Queue is full with outstanding Send Requests)
+  // a Send Request that generate Work Completion will be posted.
+  //
+  // Not following this rule may lead to a case that the Send Queue is full with Send
+  // Requests that won't generate Work Completion:
+  //
+  //  - The Send Queue is full, so no new Send Requests can be posted to it
+  //  - The Send Queue can't be emptied, since no Work Completion can be generated anymore
+  //    (the reason is that no Work Completion, that can generate Work Completion that
+  //    polling it will empty the Send Queue, can be posted)
+  //  - The status of all posted Send Request is considered unknown
+  //
+  if (slot == 0) {
+    wr.send_flags |= IBV_SEND_SIGNALED;
+    wr.wr_id = (uint64_t)req;
+    req->events++;
+  }

  struct ibv_send_wr* bad_wr;
  NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
@@ -742,8 +769,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
  struct ibv_mr* mr = (struct ibv_mr*)mhandle;

  struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
+  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
  req->size = size;

  struct ibv_recv_wr wr;
@@ -765,17 +791,16 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
  *request = req;

  // Post to FIFO to notify sender
-  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
+  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req));
  return ncclSuccess;
 }

-ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
+ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
  struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
  if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;

  struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
+  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
  struct ibv_mr* mr = (struct ibv_mr*)mhandle;

  struct ibv_send_wr wr;
@@ -792,11 +817,7 @@ ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
  struct ibv_send_wr* bad_wr;
  NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));

-  int done = 0;
-  while (done == 0) {
-    NCCLCHECK((ncclResult_t)ncclIbTest(req, &done, NULL));
-  }
-
+  *request = req;
  return ncclSuccess;
 }

@@ -805,10 +826,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
  *done = 0;

  while (1) {
-    if (r->done == 1) {
+    if (r->events == 0) {
      *done = 1;
      if (size) *size = r->size;
-      r->used = 0;
+      NCCLCHECK(ncclIbFreeRequest(r));
      return ncclSuccess;
    }

@@ -833,11 +854,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
          doneReq->size = wc->imm_data;
 #endif
        }
-        doneReq->done = 1;
-        if (doneReq->free == 1) {
-          // This is an internal (FIFO post) req. Free it immediately.
-          doneReq->used = 0;
-        }
+        doneReq->events--;
      }
    }
  }
@@ -892,7 +909,7 @@ ncclNet_t ncclNetIb = {
  ncclIbDeregMr,
  ncclIbIsend,
  ncclIbIrecv,
-  ncclIbFlush,
+  ncclIbIflush,
  ncclIbTest,
  ncclIbCloseSend,
  ncclIbCloseRecv,
@@ -49,17 +49,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
        WARN("NET/Socket : no interface found");
        return ncclInternalError;
      } else {
-        char line[1024];
-        char addrline[1024];
+        #define MAX_LINE_LEN (2047)
+        char line[MAX_LINE_LEN+1];
+        char addrline[SOCKET_NAME_MAXLEN+1];
        line[0] = '\0';
+        addrline[SOCKET_NAME_MAXLEN] = '\0';
        for (int i=0; i<ncclNetIfs; i++) {
          strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
          memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
          NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
-          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
+          snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
              socketToString(&addrs[i].sa, addrline));
        }
-        line[1023] = '\0';
+        line[MAX_LINE_LEN] = '\0';
        INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
      }
    }
@@ -113,8 +115,7 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {

 #define MAX_SOCKETS 64
 #define MAX_THREADS 16
-#define MAX_REQUESTS 128
-#define MAX_QUEUE_LEN MAX_REQUESTS
+#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
 #define MIN_CHUNKSIZE (64*1024)

 NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
@@ -150,6 +151,7 @@ struct ncclSocketRequest {

 struct ncclSocketTaskQueue {
  int next;
+  int len;
  struct ncclSocketTask* tasks;
 };

@@ -189,7 +191,7 @@ void* persistentSocketThread(void *args_) {
  while (1) {
    int idle = 1;
    int mark = myQueue->next; // mark newest task seen
-    for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) {
+    for (int i=0; i<myQueue->len; i+=nSocksPerThread) {
      int repeat;
      do {
        repeat = 0;
@@ -364,7 +366,11 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
  struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
  // create helper threads and prepare per-thread task queue
  if (queue->tasks == NULL) {
-    NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN));
+    // each request can be divided up to nSocks tasks, and
+    // these tasks are distributed to nThreads threads,
+    // we need to make sure each thread queue has enough slots for MAX_REQUESTS
+    queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads);
+    NCCLCHECK(ncclCalloc(&queue->tasks, queue->len));
    queue->next = 0;
    res->comm = comm;
    pthread_mutex_init(&res->threadLock, NULL);
@@ -383,7 +389,7 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
    r->used = 1;
    *req = r;
    pthread_mutex_lock(&res->threadLock);
-    queue->next = (queue->next+1)%MAX_QUEUE_LEN;
+    queue->next = (queue->next+1)%queue->len;
    res->state = start;
    pthread_cond_signal(&res->threadCond);
    pthread_mutex_unlock(&res->threadLock);
@@ -421,6 +427,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
    // divide into subtasks
    int chunkOffset = 0, i = 0;
    if (r->comm->nSocks > 0) {
+      // each request can be divided up to nSocks tasks
      int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
      while (chunkOffset < r->size) {
        int chunkSize = std::min(taskSize, r->size-chunkOffset);
@@ -478,7 +485,7 @@ ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle
  return ncclSuccess;
 }

-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
+ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
  // We don't support CUDA pointers, so we don't need a flush operation
  return ncclInternalError;
 }
@@ -527,7 +534,7 @@ ncclNet_t ncclNetSocket = {
  ncclSocketDeregMr,
  ncclSocketIsend,
  ncclSocketIrecv,
-  ncclSocketFlush,
+  ncclSocketIflush,
  ncclSocketTest,
  ncclSocketClose,
  ncclSocketClose,
@@ -12,26 +12,30 @@
 #include <hsa/hsa.h>
 #include <hsa/hsa_ext_amd.h>
 #endif
-#include "shm.h"
+#include "bootstrap.h"

 struct p2pConnectInfo {
-  int direct;
+  int rank;
  int read;
-  union {
-    void* directPtr;
-    hipIpcMemHandle_t devIpc;
-  };
+  void* directPtr;
+  hipIpcMemHandle_t devIpc;
 };

 struct p2pSendResources {
  struct ncclSendMem* devMem;
  void* ipcPtr;
  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
+  int remoteId;
+  int memRank;
+  void* bootstrap;
 };

 struct p2pRecvResources {
  struct ncclRecvMem* devMem;
  void* ipcPtr;
+  int remoteId;
+  int memRank;
+  void* bootstrap;
 };

 #include <sys/types.h>
@@ -69,9 +73,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
  }

  // Check topology / p2p level.
-  int read;
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, &read));
+  int intermediateRank;
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
  if (*ret == 0) return ncclSuccess;
+  if (intermediateRank != -1) return ncclSuccess;

  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
  int cudaDev1 = busIdToCudaDev(info1->busId);
@@ -114,31 +119,52 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 // Setting this to non zero causes P2P to use Reads rather than Writes
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);

-static int p2pUseRead(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
-  int readEnable = ncclParamP2pReadEnable();
-  if (readEnable != -2) return readEnable;
-
-  int p2p, read;
+static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
+  int p2p;
  // Queries the topology to see if the GPUs are Ampere and
  // connected via NVLink, if so we enable P2P Read by default
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, &read));
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));

-  return read;
+  int readEnable = ncclParamP2pReadEnable();
+  if (readEnable != -2) *read = readEnable;
+  return ncclSuccess;
+}
+
+static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) {
+  if (myInfo->pidHash == peerInfo->pidHash) {
+    if (peerInfo->cudaDev != myInfo->cudaDev) {
+      // Enable P2P access
+      hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == hipErrorPeerAccessAlreadyEnabled) {
+        hipGetLastError();
+      } else if (err != hipSuccess) {
+        WARN("failed to peer with device %d(=%lx): %d %s",
+            peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
+        return ncclInternalError;
+      }
+    }
+    *devMem = p2pInfo->directPtr;
+    *ipcPtr = NULL;
+  } else {
+    CUDACHECK(hipIpcOpenMemHandle(devMem, p2pInfo->devIpc, hipIpcMemLazyEnablePeerAccess));
+    *ipcPtr = *devMem;
+  }
+  return ncclSuccess;
 }

 /* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
    struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {

  struct p2pSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;
-  int useRead = p2pUseRead(topo, myInfo, peerInfo);
+  int useRead, intermediateRank;
+  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
  int sendSize = sizeof(struct ncclSendMem);
  // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
  if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));

  resources->next_hdp_reg = 0;
  uint32_t linktype, hops;
@@ -154,116 +180,84 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  struct p2pConnectInfo info;
  info.read = useRead;
  const char* useReadStr = info.read ? "/read" : "";
-  if (myInfo->pidHash == peerInfo->pidHash) {
-    info.direct = 1;
-    info.directPtr = resources->devMem;
-    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%d] -> %d[%d] via P2P/common device%s",
-          channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr);
-      return ncclInternalError;
+
+  resources->remoteId = -1;
+  resources->bootstrap = comm->bootstrap;
+  if (intermediateRank == -1) {
+    NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize, true));
+    info.rank = myInfo->rank;
+    if (myInfo->pidHash == peerInfo->pidHash) {
+      if (useRead == 0) send->conn.direct |= NCCL_DIRECT_GPU;
+      INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
+          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
    } else {
-      // Enable P2P access
-      hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == hipErrorPeerAccessAlreadyEnabled) {
-        hipGetLastError();
-      } else if (err != hipSuccess) {
-        WARN("failed to peer with device %d(=%lx): %d %s",
-             peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
-        return ncclInternalError;
-      }
-      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
+      CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
+      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
    }
  } else {
-    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-    info.direct = 0;
-    // Map IPC and enable P2P access
-    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != hipSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
-      return ncclInternalError;
-    }
-    INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
-        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
-    //TRACE_DUMP_IPC(&info.devIpc);
+    NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
+    info.rank = intermediateRank;
+    INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
+        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
+	comm->peerInfo[intermediateRank].busId, useReadStr);
  }
+  resources->memRank = info.rank;
+
+  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+
  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
  return ncclSuccess;
 }

 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
    struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {

  struct p2pRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;
-  int useRead = p2pUseRead(topo, myInfo, peerInfo);
+  int useRead, intermediateRank;
+  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
  int recvSize = offsetof(struct ncclRecvMem, buff);
  // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
  ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));

  struct p2pConnectInfo info;
  info.read = useRead;
-  if (myInfo->pidHash == peerInfo->pidHash) {
-    info.direct = 1;
-    info.directPtr = resources->devMem;
-    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+
+  resources->remoteId = -1;
+  resources->bootstrap = comm->bootstrap;
+  if (intermediateRank == -1) {
+    NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize, true));
+    info.rank = myInfo->rank;
+    if (myInfo->pidHash == peerInfo->pidHash) {
+      if (useRead == 0) recv->conn.direct |= NCCL_DIRECT_GPU;
    } else {
-      // Enable P2P access
-      hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == hipErrorPeerAccessAlreadyEnabled) {
-        hipGetLastError();
-      } else if (err != hipSuccess) {
-        WARN("failed to peer with device %d(=%lx): %d %s",
-             peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
-        return ncclInternalError;
-      }
-      TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+      CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
    }
  } else {
-    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-    info.direct = 0;
-    // Map IPC and enable P2P access
-    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != hipSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
-      return ncclInternalError;
-    }
-    TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
-    //TRACE_DUMP_IPC(&info.devIpc);
+    NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
+    info.rank = intermediateRank;
  }
+  resources->memRank = info.rank;
+
+  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+
  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
  return ncclSuccess;
 }

 /* Connect/Send to this peer */
-static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
  struct ncclRecvMem* remDevMem;
  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
-  if (info->direct) {
-    remDevMem = (struct ncclRecvMem*)(info->directPtr);
-    if (info->read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
-  } else {
-    //TRACE_DUMP_IPC(&info->devIpc);
-    hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
-    remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
-    if (err != hipSuccess) {
-      WARN("failed to open CUDA IPC handle : %d %s",
-          err, hipGetErrorString(err));
-      return ncclUnhandledCudaError;
-    }
-  }
+
+  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));

  int offset = 0;
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -283,26 +277,12 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
 }

 /* Connect/Recv from this peer */
-ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
  struct ncclSendMem* remDevMem;
  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
-  if (info->direct) {
-    remDevMem = (struct ncclSendMem*)(info->directPtr);
-    if (info->read == 0) {
-      recv->conn.direct |= NCCL_DIRECT_GPU;
-      recv->conn.ptrExchange = &remDevMem->ptrExchange;
-    }
-  } else {
-    //TRACE_DUMP_IPC(&info->devIpc);
-    hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
-    remDevMem = (struct ncclSendMem*)resources->ipcPtr;
-    if (err != hipSuccess) {
-      WARN("failed to open CUDA IPC handle : %d %s",
-          err, hipGetErrorString(err));
-      return ncclUnhandledCudaError;
-    }
-  }
+
+  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));

  int offset = 0;
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -316,6 +296,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
  }
  recv->conn.tail = &resources->devMem->tail;
  recv->conn.head = &remDevMem->head;
+  recv->conn.ptrExchange = &remDevMem->ptrExchange;
  return ncclSuccess;
 }

@@ -323,6 +304,10 @@ ncclResult_t p2pSendFree(void* resources) {
  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
  if (sendRes->ipcPtr)
    CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
+  if (sendRes->remoteId != -1) {
+    NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap));
+    sendRes->devMem = NULL;
+  }
  CUDACHECK(hipFree(sendRes->devMem));
  free(sendRes);
  return ncclSuccess;
@@ -332,6 +317,10 @@ ncclResult_t p2pRecvFree(void* resources) {
  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
  if (recvRes->ipcPtr)
    CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
+  if (recvRes->remoteId != -1) {
+    NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap));
+    recvRes->devMem = NULL;
+  }
  CUDACHECK(hipFree(recvRes->devMem));
  free(recvRes);
  return ncclSuccess;
@@ -57,7 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 #define MAX_SHM_NAME_LEN 1024

 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {

  struct shmSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
@@ -81,7 +81,7 @@ ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  return ncclSuccess;
 }

-ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
+ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
  struct shmRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;
@@ -106,7 +106,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
 }

 /* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -131,7 +131,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
  return ncclSuccess;
 }

-ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
@@ -65,13 +65,25 @@ BEGIN {
    do {
      match($col_1, /\[([0-9]+)\]/, ary)
      chan=strtonum(ary[1])
-      match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\|(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)/, ary)
-      if(ary[8]!="-1")
-        treedns[ary[7] "," ary[8] "," chan]="1"
-      if(ary[9]!="-1")
-        treedns[ary[7] "," ary[9] "," chan]="1"
-      if(ary[10]!="-1")
-        treedns[ary[7] "," ary[10] "," chan]="1"
+      where = match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\|(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)/, ary)
+      if(where != 0) {
+        if(ary[8]!="-1")
+          treedns[ary[7] "," ary[8] "," chan]="1"
+        if(ary[9]!="-1")
+          treedns[ary[7] "," ary[9] "," chan]="1"
+        if(ary[10]!="-1")
+          treedns[ary[7] "," ary[10] "," chan]="1"
+      } else {
+        where = match($col_2, /(\-?[0-9]+)\/(\-?[0-9]+)\/(\-?[0-9]+)\->(\-?[0-9]+)\->(\-?[0-9]+)/, ary)
+        if(where != 0) {
+          if(ary[1]!="-1")
+            treedns[ary[4] "," ary[1] "," chan]="1"
+          if(ary[2]!="-1")
+            treedns[ary[4] "," ary[2] "," chan]="1"
+          if(ary[3]!="-1")
+            treedns[ary[4] "," ary[3] "," chan]="1"
+        }
+      }
      if(chan>max_treedn)
        max_treedn=chan
      col_1=col_1+2
@@ -31,6 +31,32 @@
 #include <list>
 #include <iterator>

+struct ibtestProxyArgs {
+  proxyProgressFunc_t progress;
+  struct ncclChannel* channel;
+  struct ncclConnector* connector;
+  int sliceSteps;
+  int chunkSteps;
+  int nsteps;
+  uint64_t opCount;
+  int protocol;
+  ncclDataType_t dtype;
+  ncclRedOp_t redOp;
+  int state;   // add component before this line -- it is left out during initialization
+
+  // Internal state
+  uint64_t head;
+  uint64_t tail;
+  uint64_t end;
+  void* requests[NCCL_STEPS];
+  int idle;
+
+  // Element linking
+  pthread_mutex_t mutex;
+  struct ibtestProxyArgs* next;
+  struct ibtestProxyArgs* nextPeer;
+};
+
 ncclResult_t initNet();

 char* getCmdOption(char ** begin, char ** end, const std::string & option) {
@@ -204,7 +230,7 @@ private:
  bool runSend;
  bool use_gdr_read;
  int sliceSteps;
-  struct ncclProxyArgs args;
+  struct ibtestProxyArgs args;

  ncclResult_t connect(char* ip, uint16_t port) {
    inet_pton(AF_INET, ip, &netConnectAddr.sin_addr);
@@ -326,7 +352,7 @@ public:
  void launchKernel(uint64_t end) {
    *sendHead = 0; *sendTail = 0; *sourceCycle = 0; *sourceBytes = 0;
    send_sizes = 0; send_bw_cumulative = 0; send_bw_count =0; send_byte = 0;
-    memset(&args, 0, sizeof(struct ncclProxyArgs));
+    memset(&args, 0, sizeof(struct ibtestProxyArgs));
    args.head = 0;
    args.tail = 0;
    args.end = end;
@@ -365,7 +391,7 @@ private:
  bool runRecv;
  bool use_gdr_write;
  int sliceSteps;
-  struct ncclProxyArgs args;
+  struct ibtestProxyArgs args;

  ncclResult_t listen() {
    printf("GDR Write %s\n", use_gdr_write ? "enabled" : "disabled");
@@ -472,7 +498,7 @@ public:
          }
          args.head += args.sliceSteps;
          recv_byte += size;
-          NCCLCHECK(ncclNetFlush(netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
+          NCCLCHECK(ncclNetIflush(netRecvComm, localBuff+buffSlot*stepSize, size, mhandle, args.requests+buffSlot));
          STORE(recvHead, args.head);
          args.idle = 0;
        }
@@ -486,7 +512,7 @@ public:
  void launchKernel(uint64_t end) {
    *recvHead = 0; *recvTail = 0; *recvErrorCount = 0; *sinkCycle = 0, *sinkBytes = 0;
    recv_sizes = 0; recv_bw_cumulative = 0; recv_bw_count =0; recv_byte = 0;
-    memset(&args, 0, sizeof(struct ncclProxyArgs));
+    memset(&args, 0, sizeof(struct ibtestProxyArgs));
    args.head = 0;
    args.tail = 0;
    args.end = end;
--- a/Show More
+++ b/Show More