From 084207e685c4587e7d0aa2f1f7f148d3e0e68da6 Mon Sep 17 00:00:00 2001
From: xietingwew <xietingwen@meituan.com>
Date: Mon, 19 Oct 2020 11:20:23 +0800
Subject: [PATCH 01/25] fix proxyArgs for trace log

---
 src/enqueue.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/enqueue.cc b/src/enqueue.cc
index a8c0e41277..40e8f57107 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -410,7 +410,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
   proxyArgs->dtype = info->datatype;
   proxyArgs->redOp = info->op;
   TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
-      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
+      proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
       nLoops, proxyArgs->nsteps, info->comm);
   return ncclSuccess;
 }

From 920dbe5b359fe5817b8ba874476ca4ba2dc5f1ef Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Fri, 4 Sep 2020 14:35:05 -0700
Subject: [PATCH 02/25] 2.8.3-1

Optimization for Tree allreduce on A100.
Improve aggregation performance.
Use shared buffers for inter-node send/recv.
Add NVTX profiling hooks.
Accelerate alltoall connections by merging communication for all
channels.
Add support for one hop communication through NVLink, for faster
send/recv communication on cubemesh topologies like DGX-1.
Improve alltoall scheduling to better balance intra/inter node
communication.
Increase send/recv parallelism by 8x, each warp sending or
receiving to a different peer.
Net: move to v4.
Net: make flush operation asynchronous to accelerate alltoall.
Net: define maximum number of requests.
Fix hang when using LL128 protocol after 2^31 steps.
Fix #379 : topology injection failing when using less GPUs than
described in the XML.
Fix #394 : protocol mismatch causing hangs or crashes when using
one GPU per node.
---
 makefiles/common.mk                           |    5 +
 makefiles/version.mk                          |    4 +-
 pkg/debian/control.in                         |    4 +-
 pkg/redhat/nccl.spec.in                       |    6 +-
 src/bootstrap.cc                              |  460 ++--
 src/channel.cc                                |    4 +-
 src/collectives/all_gather.cc                 |    5 +-
 src/collectives/all_reduce.cc                 |    5 +-
 src/collectives/broadcast.cc                  |    5 +-
 src/collectives/device/all_gather.cu          |    4 +-
 src/collectives/device/all_gather.h           |  346 +--
 src/collectives/device/all_reduce.cu          |    4 +-
 src/collectives/device/all_reduce.h           | 1021 ++++----
 src/collectives/device/broadcast.cu           |    4 +-
 src/collectives/device/broadcast.h            |  255 +-
 src/collectives/device/common.h               |  205 +-
 src/collectives/device/common_kernel.h        |  184 +-
 src/collectives/device/functions.cu           |  102 +-
 src/collectives/device/primitives.h           |  332 ++-
 src/collectives/device/prims_ll.h             |    2 +-
 src/collectives/device/prims_ll128.h          |   10 +-
 src/collectives/device/reduce.cu              |    4 +-
 src/collectives/device/reduce.h               |  243 +-
 src/collectives/device/reduce_scatter.cu      |    4 +-
 src/collectives/device/reduce_scatter.h       |  305 +--
 src/collectives/device/sendrecv.cu            |    5 +-
 src/collectives/device/sendrecv.h             |  143 +-
 src/collectives/reduce.cc                     |    5 +-
 src/collectives/reduce_scatter.cc             |    5 +-
 src/collectives/sendrecv.cc                   |    6 +-
 src/debug.cc                                  |    6 +-
 src/enqueue.cc                                |  389 +--
 src/graph/connect.cc                          |  175 +-
 src/graph/paths.cc                            |   50 +-
 src/graph/rings.cc                            |    2 +-
 src/graph/search.cc                           |   60 +-
 src/graph/topo.cc                             |   32 +-
 src/graph/topo.h                              |   39 +-
 src/graph/trees.cc                            |   53 +-
 src/graph/tuning.cc                           |   99 +-
 src/graph/xml.cc                              |   44 +-
 src/graph/xml.h                               |   45 +-
 src/group.cc                                  |  190 +-
 src/include/bootstrap.h                       |    4 +-
 src/include/coll_net.h                        |    2 +-
 src/include/collectives.h                     |   81 +-
 src/include/comm.h                            |   19 +-
 src/include/core.h                            |    1 +
 src/include/cpuset.h                          |    4 +-
 src/include/devcomm.h                         |   77 +-
 src/include/enqueue.h                         |    2 +
 src/include/graph.h                           |   18 +-
 src/include/nccl_net.h                        |   27 +-
 src/include/net.h                             |    4 +-
 src/include/nvmlwrap.h                        |   16 +-
 src/include/nvtx.h                            |   14 +
 src/include/nvtx3.hpp                         | 2268 +++++++++++++++++
 src/include/nvtx3/nvToolsExt.h                | 1470 +++++++++++
 src/include/nvtx3/nvToolsExtCuda.h            |  141 +
 src/include/nvtx3/nvToolsExtCudaRt.h          |  117 +
 src/include/nvtx3/nvToolsExtOpenCL.h          |  191 ++
 src/include/nvtx3/nvToolsExtSync.h            |  382 +++
 src/include/nvtx3/nvtxDetail/nvtxImpl.h       |  438 ++++
 src/include/nvtx3/nvtxDetail/nvtxImplCore.h   |  307 +++
 .../nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h      |   81 +
 .../nvtx3/nvtxDetail/nvtxImplCuda_v3.h        |  102 +
 .../nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h      |  161 ++
 .../nvtx3/nvtxDetail/nvtxImplSync_v3.h        |   83 +
 src/include/nvtx3/nvtxDetail/nvtxInit.h       |  312 +++
 src/include/nvtx3/nvtxDetail/nvtxInitDecls.h  |   81 +
 src/include/nvtx3/nvtxDetail/nvtxInitDefs.h   |  573 +++++
 src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h   |   83 +
 src/include/nvtx3/nvtxDetail/nvtxTypes.h      |  304 +++
 src/include/p2p.h                             |   39 +-
 src/include/param.h                           |    5 +-
 src/include/proxy.h                           |   34 +-
 src/include/socket.h                          |   21 +-
 src/include/transport.h                       |    7 +-
 src/include/trees.h                           |    6 +-
 src/init.cc                                   |  190 +-
 src/misc/argcheck.cc                          |   10 +-
 src/misc/nvmlwrap.cc                          |   56 +-
 src/proxy.cc                                  |  509 +++-
 src/transport.cc                              |  118 +-
 src/transport/coll_net.cc                     |  305 +--
 src/transport/net.cc                          |  435 ++--
 src/transport/net_ib.cc                       |  157 +-
 src/transport/net_socket.cc                   |   29 +-
 src/transport/p2p.cc                          |  218 +-
 src/transport/shm.cc                          |    8 +-
 90 files changed, 11172 insertions(+), 3209 deletions(-)
 create mode 100644 src/include/nvtx.h
 create mode 100644 src/include/nvtx3.hpp
 create mode 100644 src/include/nvtx3/nvToolsExt.h
 create mode 100644 src/include/nvtx3/nvToolsExtCuda.h
 create mode 100644 src/include/nvtx3/nvToolsExtCudaRt.h
 create mode 100644 src/include/nvtx3/nvToolsExtOpenCL.h
 create mode 100644 src/include/nvtx3/nvToolsExtSync.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxImpl.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxImplCore.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxInit.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxInitDecls.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxTypes.h

diff --git a/makefiles/common.mk b/makefiles/common.mk
index 8e91a45f69..d4c353b94a 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -11,6 +11,7 @@ KEEP ?= 0
 DEBUG ?= 0
 TRACE ?= 0
 PROFAPI ?= 0
+NVTX ?= 1
 
 NVCC = $(CUDA_HOME)/bin/nvcc
 
@@ -87,6 +88,10 @@ ifneq ($(TRACE), 0)
 CXXFLAGS  += -DENABLE_TRACE
 endif
 
+ifeq ($(NVTX), 0)
+CXXFLAGS  += -DNVTX_DISABLE
+endif
+
 ifneq ($(KEEP), 0)
 NVCUFLAGS += -keep
 endif
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 8142428a73..f2539c595b 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 7
-NCCL_PATCH   := 8
+NCCL_MINOR   := 8
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/pkg/debian/control.in b/pkg/debian/control.in
index c8f5266008..22c60f8785 100644
--- a/pkg/debian/control.in
+++ b/pkg/debian/control.in
@@ -9,7 +9,7 @@ Package: libnccl${nccl:Major}
 Section: libs
 Architecture: ${pkg:Arch}
 Depends: ${misc:Depends}, ${shlibs:Depends}
-Description: NVIDIA Collectives Communication Library (NCCL) Runtime
+Description: NVIDIA Collective Communication Library (NCCL) Runtime
  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
  broadcast, and reduce-scatter.
@@ -21,7 +21,7 @@ Package: libnccl-dev
 Section: libdevel
 Architecture: ${pkg:Arch}
 Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
-Description: NVIDIA Collectives Communication Library (NCCL) Development Files
+Description: NVIDIA Collective Communication Library (NCCL) Development Files
  NCCL (pronounced "Nickel") is a stand-alone library of standard collective
  communication routines for GPUs, implementing all-reduce, all-gather, reduce,
  broadcast, and reduce-scatter.
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index 5fad346f4a..f1cce5c3bd 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -1,7 +1,7 @@
 Name:           libnccl
 Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
 Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
-Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
 
 Group:          Development/Libraries
 License:        BSD
@@ -18,13 +18,13 @@ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
 sockets.
 
 %package devel
-Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
 Group:          Development/Libraries
 %description devel
 NCCL development files
 
 %package static
-Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Summary:        NVIDIA Collective Communication Library (NCCL) Runtime
 Group:          Development/Libraries
 %description static
 NCCL static library
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index e90dd66823..bd6ec99c6c 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -13,144 +13,77 @@
 #include <unistd.h>
 #include <sys/types.h>
 
-struct bootstrapNetComm {
-  int fd;
-};
-
 /* Init functions */
-static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
-static int bootstrapNetIfs = -1;
+static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
+static union socketAddress bootstrapNetIfAddr;
+static int bootstrapNetInitDone = 0;
 pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
 
 ncclResult_t bootstrapNetInit() {
-  if (bootstrapNetIfs == -1) {
+  if (bootstrapNetInitDone == 0) {
     pthread_mutex_lock(&bootstrapNetLock);
-    if (bootstrapNetIfs == -1) {
-      bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-      if (bootstrapNetIfs <= 0) {
-        WARN("Bootstrap : no socket interface found");
-        return ncclInternalError;
-      } else {
-        char line[1024];
-        char addrline[1024];
-        line[0] = '\0';
-        for (int i=0; i<bootstrapNetIfs; i++) {
-          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
-              socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
+    if (bootstrapNetInitDone == 0) {
+      char* env = getenv("NCCL_COMM_ID");
+      if (env) {
+        union socketAddress remoteAddr;
+        if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
+          WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+          return ncclInvalidArgument;
+        }
+        if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+          WARN("NET/Socket : No usable listening interface found");
+          return ncclSystemError;
+        }
+      } else {
+        int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
+        if (nIfs <= 0) {
+          WARN("Bootstrap : no socket interface found");
+          return ncclInternalError;
         }
-        line[1023] = '\0';
-        INFO(NCCL_INIT, "Bootstrap : Using%s", line);
       }
+      char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
+      sprintf(line, " %s:", bootstrapNetIfName);
+      socketToString(&bootstrapNetIfAddr.sa, line+strlen(line));
+      INFO(NCCL_INIT, "Bootstrap : Using%s", line);
+      bootstrapNetInitDone = 1;
     }
     pthread_mutex_unlock(&bootstrapNetLock);
   }
   return ncclSuccess;
 }
 
-static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
-  NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->fd = -1;
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
-  if (dev >= bootstrapNetIfs) return ncclInternalError;
-  memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
-  return ncclSuccess;
-}
-
 /* Socket Interface Selection type */
 enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
 
-static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
-  union socketAddress* connectAddr = (union socketAddress*) netHandle;
-  static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
-  // if dev >= 0, listen based on dev
-  if (dev >= 0) {
-    NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
-  } else if (dev == findSubnetIf) {
-    // handle stores a remote address
-    // need to find a local addr that is in the same network as the remote addr
-    union socketAddress localAddr;
-    char ifName[MAX_IF_NAME_SIZE];
-    if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
-      WARN("NET/Socket : No usable listening interface found");
-      return ncclSystemError;
-    }
-    // pass the local address back
-    memcpy(connectAddr, &localAddr, sizeof(localAddr));
-  } // Otherwise, handle stores a local address
-  struct bootstrapNetComm* comm;
-  NCCLCHECK(bootstrapNetNewComm(&comm));
-  NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
-  *listenComm = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
-  union socketAddress* connectAddr = (union socketAddress*) netHandle;
-  struct bootstrapNetComm* comm;
-  NCCLCHECK(bootstrapNetNewComm(&comm));
-  NCCLCHECK(connectAddress(&comm->fd, connectAddr));
-  *sendComm = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
-  struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
-  struct bootstrapNetComm* rComm;
-  NCCLCHECK(bootstrapNetNewComm(&rComm));
+static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd) {
   struct sockaddr_in sockaddr;
   socklen_t socklen = sizeof(struct sockaddr_in);
-  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
-  *recvComm = rComm;
+  SYSCHECKVAL(accept(listenFd, (struct sockaddr*)&sockaddr, &socklen), "accept", *recvFd);
   return ncclSuccess;
 }
 
-static ncclResult_t bootstrapNetClose(void* opaqueComm) {
-  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
-  if (comm) {
-    close(comm->fd);
-    free(comm);
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
-
 // Additional sync functions
-static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
-  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
-  NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
-  NCCLCHECK(socketSend(comm->fd, data, size));
+static ncclResult_t bootstrapNetSend(int fd, void* data, int size) {
+  NCCLCHECK(socketSend(fd, &size, sizeof(int)));
+  NCCLCHECK(socketSend(fd, data, size));
   return ncclSuccess;
 }
-static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
-  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
+static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
   int recvSize;
-  NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
+  NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
   if (recvSize > size) {
     WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
     return ncclInternalError;
   }
-  NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
-  union socketAddress* connectAddr = (union socketAddress*) netHandle;
-  NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
+  NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
   return ncclSuccess;
 }
 
 struct extInfo {
   int rank;
   int nranks;
-  ncclNetHandle_t extHandleListenRoot;
-  ncclNetHandle_t extHandleListen;
+  union socketAddress extAddressListenRoot;
+  union socketAddress extAddressListen;
 };
 
 #include <sys/resource.h>
@@ -163,27 +96,29 @@ static ncclResult_t setFilesLimit() {
   return ncclSuccess;
 }
 
-static void *bootstrapRoot(void* listenComm) {
+static void *bootstrapRoot(void* args) {
+  int listenFd = (uint64_t)args;
+  ncclResult_t res = ncclSuccess;
+  int nranks = 0, c = 0;
   struct extInfo info;
-  ncclNetHandle_t *rankHandles = NULL;
-  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
-  ncclNetHandle_t zero = { 0 }; // for sanity checking
-  void* tmpComm;
-  ncclResult_t res;
+  union socketAddress *rankAddresses = NULL;
+  union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
+  union socketAddress *zero = NULL;
+  NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
   setFilesLimit();
 
   TRACE(NCCL_INIT, "BEGIN");
   /* Receive addresses from all ranks */
-  int nranks = 0, c = 0;
   do {
-    NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
-    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
-    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
+    int tmpFd;
+    NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd), res, out);
+    NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &info, sizeof(info)), res, out);
+    close(tmpFd);
 
     if (c == 0) {
       nranks = info.nranks;
-      NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
-      NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out);
     }
 
     if (nranks != info.nranks) {
@@ -191,14 +126,14 @@ static void *bootstrapRoot(void* listenComm) {
       goto out;
     }
 
-    if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+    if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
       WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
       goto out;
     }
 
     // Save the connection handle for that rank
-    memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
-    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+    memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
+    memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
 
     ++c;
     TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d",  info.rank, c, nranks);
@@ -208,44 +143,46 @@ static void *bootstrapRoot(void* listenComm) {
   // Send the connect handle for the next rank in the AllGather ring
   for (int r=0; r<nranks; ++r) {
     int next = (r+1) % nranks;
-    void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
-    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
-    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
+    int tmpSendFd;
+    NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
+    NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddresses+next, sizeof(union socketAddress)), res, out);
+    close(tmpSendFd);
   }
   TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
 
 out:
-  bootstrapNetCloseListen(listenComm);
-  if (rankHandles) free(rankHandles);
-  if (rankHandlesRoot) free(rankHandlesRoot);
+  close(listenFd);
+  if (rankAddresses) free(rankAddresses);
+  if (rankAddressesRoot) free(rankAddressesRoot);
+  if (zero) free(zero);
 
   TRACE(NCCL_INIT, "DONE");
   return NULL;
 }
 
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
-  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
-  void* listenComm;
-  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
+  union socketAddress* connectAddr = (union socketAddress*) id;
+  int listenFd;
+  NCCLCHECK(createListenSocket(&listenFd, connectAddr));
   pthread_t thread;
-  pthread_create(&thread, NULL, bootstrapRoot, listenComm);
+  pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd);
   return ncclSuccess;
 }
 
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
-  static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
   memset(id, 0, sizeof(ncclUniqueId));
-  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+  union socketAddress* connectAddr = (union socketAddress*) id;
 
   char* env = getenv("NCCL_COMM_ID");
   if (env) {
     INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
-    if (bootstrapNetCreateHandle(netHandle, env) != 0) {
+    if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
       WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
       return ncclInvalidArgument;
     }
   } else {
+    memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
     NCCLCHECK(bootstrapCreateRoot(id, false));
   }
 
@@ -254,24 +191,135 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
 
 struct unexConn {
   int peer;
-  void* comm;
+  int fd;
   struct unexConn* next;
 };
 
-struct extState {
-  void* extBstrapListenComm;
-  void* extBstrapRingRecvComm;
-  void* extBstrapRingSendComm;
-  ncclNetHandle_t* peerBstrapHandles;
-  struct unexConn* unexpectedConnections;
-  int rank;
-  int nranks;
-  int dev;
+// Remote allocator state
+struct remAllocState {
+  int cudaDev;
+  int listenFd;
+  int stop;
 };
 
+struct extState {
+  int extListenFd;
+  int extRingRecvFd;
+  int extRingSendFd;
+  union socketAddress* peerCommAddresses;
+  union socketAddress* peerAllocAddresses;
+  struct unexConn* unexpectedConnections;
+  int cudaDev;
+  int rank;
+  int nranks;
+
+  // Intermediate memory allocation service
+  struct remAllocState* allocState;
+  pthread_t allocThread;
+};
+
+#define MAX_SEGMENTS 128
+
+static ncclResult_t remoteAlloc(void** ptr, int fd) {
+  size_t size;
+  NCCLCHECK(socketRecv(fd, &size, sizeof(size_t)));
+  cudaIpcMemHandle_t devIpc;
+  NCCLCHECK(ncclCudaCalloc((char**)ptr, size));
+  cudaError_t res = cudaIpcGetMemHandle(&devIpc, *ptr);
+  if (res != cudaSuccess) {
+    WARN("[Rem Allocator] cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
+    cudaFree(*ptr);
+    CUDACHECK(res);
+  }
+  // The CUDA IPC
+  NCCLCHECK(socketSend(fd, &devIpc, sizeof(cudaIpcMemHandle_t)));
+  // And the direct pointer
+  NCCLCHECK(socketSend(fd, ptr, sizeof(void*)));
+  return ncclSuccess;
+}
+
+#include <poll.h>
+
+// Service thread to allocate memory for other GPUs, used as intermediate step.
+void* ncclRemoteMemAllocationService(void* args) {
+  struct remAllocState* state = (struct remAllocState *) args;
+  if (cudaSetDevice(state->cudaDev) != cudaSuccess) {
+    WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
+  }
+
+  // Prepare poll descriptor
+  void* segments[MAX_SEGMENTS];
+  struct pollfd pollfds[MAX_SEGMENTS+1];
+  for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
+  for (int s=0; s<MAX_SEGMENTS; s++) {
+    pollfds[s].fd = -1;
+    pollfds[s].events = POLLHUP;
+  }
+  pollfds[MAX_SEGMENTS].fd = state->listenFd;
+  pollfds[MAX_SEGMENTS].events = POLLIN;
+
+  int nbuffers = 0;
+  while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
+    if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
+      WARN("[Rem Allocator] Poll failed with error %d", error);
+      return NULL;
+    }
+    if (pollfds[MAX_SEGMENTS].revents) {
+      int s = 0;
+      while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
+      if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd) != ncclSuccess) {
+        pollfds[s].fd = -1;
+      } else {
+        if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd) != ncclSuccess)) {
+          WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
+          close(pollfds[s].fd);
+          pollfds[s].fd = -1;
+        } else {
+          nbuffers++;
+        }
+      }
+    }
+    for (int s=0; s<MAX_SEGMENTS; s++) {
+      if (pollfds[s].revents & POLLHUP) {
+        if (cudaFree(segments[s]) != cudaSuccess) {
+          WARN("[Rem Allocator] cudaFree %p failed", segments[s]);
+        }
+        segments[s] = NULL;
+        close(pollfds[s].fd);
+        pollfds[s].fd = -1;
+        nbuffers--;
+      }
+    }
+  }
+  for (int s=0; s<MAX_SEGMENTS; s++) {
+    if (segments[s]) cudaFree(segments[s]);
+    close(pollfds[s].fd);
+  }
+  close(state->listenFd);
+  free(state);
+  return NULL;
+}
+
+ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr) {
+  struct extState* state = (struct extState*)commState;
+  int fd;
+  ncclResult_t res;
+  *id = -1;
+  NCCLCHECK(connectAddress(&fd, state->peerAllocAddresses+rank));
+  NCCLCHECKGOTO(socketSend(fd, &size, sizeof(size_t)), res, end);
+  NCCLCHECKGOTO(socketRecv(fd, ipc, sizeof(cudaIpcMemHandle_t)), res, end);
+  NCCLCHECKGOTO(socketRecv(fd, ptr, sizeof(void*)), res, end);
+  *id = fd;
+end:
+  return res;
+}
+
+ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
+  SYSCHECK(close(id), "close");
+  return ncclSuccess;
+}
+
 ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
-  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
-  bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
   struct extState* state;
   NCCLCHECK(ncclCalloc(&state, 1));
   state->rank = rank;
@@ -283,19 +331,15 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
   struct extInfo info = { 0 };
   info.rank = rank;
   info.nranks = nranks;
-  void *tmpSendComm, *tmpRecvComm;
-  // Pass the remote address to listen via info
-  if (idFromEnv) {
-    memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
-  }
-  // listen will return the local address via info (specify interface type 'findSubnetIf')
-  state->dev = idFromEnv ? findSubnetIf : 0;
-  void* extBstrapListenCommRoot;
-  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
-  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+  int tmpSendFd, tmpRecvFd;
 
-  // stagger connection times to avoid an overload of the root at very high rank counts
+  int extListenFdRoot;
+  memcpy(&info.extAddressListen,     &bootstrapNetIfAddr, sizeof(union socketAddress));
+  memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
+  NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
+  NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
+
+  // stagger connection times to avoid an overload of the root
   if (nranks > 128) {
     long msec = rank;
     struct timespec tv;
@@ -306,25 +350,35 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
   }
 
   // send info on my listening socket to root
-  NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
-  NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
-  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  union socketAddress* rootAddr = (union socketAddress*)id;
+  NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
+  NCCLCHECK(bootstrapNetSend(tmpSendFd, &info, sizeof(info)));
+  close(tmpSendFd);
 
   // get info on my "next" rank in the bootstrap ring from root
-  ncclNetHandle_t extHandleNext;
-  NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
-  NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
-  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
-  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
+  union socketAddress extAddressNext;
+  NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd));
+  NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &extAddressNext, sizeof(extAddressNext)));
+  close(tmpRecvFd);
+  close(extListenFdRoot);
 
-  NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
+  NCCLCHECK(connectAddress(&state->extRingSendFd, &extAddressNext));
   // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+  NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd));
 
   // AllGather all listen handlers
-  NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
-  memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
-  NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+  NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
+  memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
+  NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
+
+  // Create the memory allocation service
+  NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
+  memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
+  NCCLCHECK(ncclCalloc(&state->allocState, 1));
+  CUDACHECK(cudaGetDevice(&state->allocState->cudaDev));
+  NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
+  pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
+  NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
 
   TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
 
@@ -348,9 +402,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
     size_t sslice = (rank - i + nranks) % nranks;
 
     // Send slice to the right
-    NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
+    NCCLCHECK(bootstrapNetSend(state->extRingSendFd, data+sslice*size, size));
     // Recv slice from the left
-    NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+    NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, data+rslice*size, size));
   }
 
   TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
@@ -359,20 +413,20 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
 
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
   struct extState* state = (struct extState*)commState;
-  void* tmpSendComm;
-  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
-  NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
-  NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
-  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  int tmpSendFd;
+  NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer));
+  NCCLCHECK(bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int)));
+  NCCLCHECK(bootstrapNetSend(tmpSendFd, data, size));
+  close(tmpSendFd);
   return ncclSuccess;
 }
 
-ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
   // New unex
   struct unexConn* unex;
   NCCLCHECK(ncclCalloc(&unex, 1));
   unex->peer = peer;
-  unex->comm = comm;
+  unex->fd = fd;
 
   // Enqueue
   struct unexConn* list = state->unexpectedConnections;
@@ -385,7 +439,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
   return ncclSuccess;
 }
 
-void* unexpectedDequeue(struct extState* state, int peer) {
+int unexpectedDequeue(struct extState* state, int peer) {
   struct unexConn* elem = state->unexpectedConnections;
   struct unexConn* prev = NULL;
   while (elem) {
@@ -395,41 +449,41 @@ void* unexpectedDequeue(struct extState* state, int peer) {
       } else {
         prev->next = elem->next;
       }
-      void* comm = elem->comm;
+      int fd = elem->fd;
       free(elem);
-      return comm;
+      return fd;
     }
     prev = elem;
     elem = elem->next;
   }
-  return NULL;
+  return -1;
 }
 
 // We can't know who we'll receive from, so we need to receive everything at once
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
   struct extState* state = (struct extState*)commState;
 
-  void* tmpRecvComm;
+  int tmpRecvFd;
 
   // Search unexpected connections first
-  if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
-    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
-    NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+  if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) {
+    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
+    close(tmpRecvFd);
     return ncclSuccess;
   }
 
   // Then look for new connections
   while (1) {
-    NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+    NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd));
     int newPeer;
-    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int)));
     if (newPeer == peer) {
-      NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
-      NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+      NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
+      close(tmpRecvFd);
       return ncclSuccess;
     }
     // Unexpected connection. Save for later.
-    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd));
   }
 }
 
@@ -439,11 +493,17 @@ ncclResult_t bootstrapClose(void* commState) {
     WARN("Unexpected connections are not empty.\n");
     return ncclInternalError;
   }
-  NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
-  NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
-  NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+  close(state->extListenFd);
+  close(state->extRingSendFd);
+  close(state->extRingRecvFd);
 
-  free(state->peerBstrapHandles);
+  state->allocState->stop = 1;
+
+  // Join the allocThread so we catch resource leaks as being hung here
+  // pthread_join(state->allocThread, nullptr);
+
+  free(state->peerCommAddresses);
+  free(state->peerAllocAddresses);
   free(state);
 
   return ncclSuccess;
@@ -451,10 +511,12 @@ ncclResult_t bootstrapClose(void* commState) {
 
 ncclResult_t bootstrapAbort(void* commState) {
   struct extState* state = (struct extState*)commState;
-  bootstrapNetCloseListen(state->extBstrapListenComm);
-  bootstrapNetCloseSend(state->extBstrapRingSendComm);
-  bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
-  free(state->peerBstrapHandles);
+  close(state->extListenFd);
+  close(state->extRingSendFd);
+  close(state->extRingRecvFd);
+  state->allocState->stop = 2;
+  free(state->peerCommAddresses);
+  free(state->peerAllocAddresses);
   free(state);
   return ncclSuccess;
 }
diff --git a/src/channel.cc b/src/channel.cc
index d22ea63a2e..b2cc5b71fb 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -25,14 +25,14 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
   }
 
   // Per-channel operation list.
-  NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
+  NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
   return ncclSuccess;
 }
 
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
   if (channel->id == -1) return ncclSuccess;
   // Operation list
-  NCCLCHECK(ncclCudaHostFree(channel->collectives));
+  NCCLCHECK(ncclCudaHostFree(channel->workFifo));
 
   // Free Ring index to rank tables
   free(channel->ring.userRanks);
diff --git a/src/collectives/all_gather.cc b/src/collectives/all_gather.cc
index 348c176001..266fd5a838 100644
--- a/src/collectives/all_gather.cc
+++ b/src/collectives/all_gather.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
-  struct ncclInfo info = { ncclCollAllGather, "AllGather",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncAllGather, "AllGather",
     sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
     ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
   return ncclEnqueueCheck(&info);
diff --git a/src/collectives/all_reduce.cc b/src/collectives/all_reduce.cc
index 7796d5b064..b67f3be82e 100644
--- a/src/collectives/all_reduce.cc
+++ b/src/collectives/all_reduce.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,7 +10,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
     sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
     ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
   return ncclEnqueueCheck(&info);
diff --git a/src/collectives/broadcast.cc b/src/collectives/broadcast.cc
index 042301b376..db0fb495d3 100644
--- a/src/collectives/broadcast.cc
+++ b/src/collectives/broadcast.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream) {
-  struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
     sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
     BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
   return ncclEnqueueCheck(&info);
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 109c3416c3..4022e2e9f5 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
+IMPL_COLL_C(AllGather);
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index f7556b0c3f..e057dc8646 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -8,197 +8,201 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads-WARP_SIZE;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads-WARP_SIZE;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*realChunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t chunkOffset = gridOffset + bid*realChunkSize;
 
-    /////////////// begin AllGather steps ///////////////
-    ssize_t offset;
-    int nelem = min(realChunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin AllGather steps ///////////////
+        ssize_t offset;
+        int nelem = min(realChunkSize, size-chunkOffset);
+        int rankDest;
 
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
 
-    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      prims.directSend(thisInput+chunkOffset, offset, nelem);
-    } else {
-      prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
+        if (thisInput + chunkOffset == thisOutput + offset) { // In place
+          prims.directSend(thisInput+chunkOffset, offset, nelem);
+        } else {
+          prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j=1; j<nranks-1; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+        }
+
+        // Make final copy from buffer to dest.
+        rankDest = ring->devUserRanks[1];
+        offset = chunkOffset + rankDest * size;
+
+        // Final wait/copy.
+        prims.directRecv(thisOutput+offset, offset, nelem);
+      }
     }
+};
 
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
 
-      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+        /////////////// begin AllGather steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;
+
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        if (thisInput + chunkOffset == thisOutput + offset) { // In place
+          LLprims.send(thisInput+chunkOffset, nelem);
+        } else {
+          LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j=1; j<nranks-1; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          LLprims.recvCopySend(thisOutput+offset, nelem);
+        }
+
+        // step k-1: final store
+        rankDest = ring->devUserRanks[1];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recv(thisOutput+offset, nelem);
+      }
     }
-
-    // Make final copy from buffer to dest.
-    rankDest = ring->devUserRanks[1];
-    offset = chunkOffset + rankDest * size;
-
-    // Final wait/copy.
-    prims.directRecv(thisOutput+offset, offset, nelem);
-  }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
-    }
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
-    /////////////// begin AllGather steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
-
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
-
-    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LLprims.send(thisInput+chunkOffset, nelem);
-    } else {
-      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
-    }
-
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
-
-      LLprims.recvCopySend(thisOutput+offset, nelem);
-    }
-
-    // step k-1: final store
-    rankDest = ring->devUserRanks[1];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.recv(thisOutput+offset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
+};
 
 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
 
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
 
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;
 
-    /////////////// begin AllGather steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin AllGather steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;
 
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
 
-    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LLprims.send(thisInput+chunkOffset, nelem);
-    } else {
-      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+        if (thisInput + chunkOffset == thisOutput + offset) { // In place
+          LLprims.send(thisInput+chunkOffset, nelem);
+        } else {
+          LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j=1; j<nranks-1; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          LLprims.recvCopySend(thisOutput+offset, nelem);
+        }
+
+        // step k-1: final store
+        rankDest = ring->devUserRanks[1];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recv(thisOutput+offset, nelem);
+      }
     }
+};
 
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+template<int PROTO, class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_TREE, PROTO, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
 
-      LLprims.recvCopySend(thisOutput+offset, nelem);
-    }
+template<int PROTO, class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
 
-    // step k-1: final store
-    rankDest = ring->devUserRanks[1];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.recv(thisOutput+offset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index 85d007e806..e7c3c28cfb 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
+IMPL_COLL_R(AllReduce);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index d4eee03c8e..fe2e6fc541 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -8,516 +8,442 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads-WARP_SIZE;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads-WARP_SIZE;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclRing* ring = &channel->ring;
+    const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+    const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+    const int nranks = comm->nRanks;
+    const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+    const ssize_t size = args->coll.count;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
+    ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
+      prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
-    ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
+      ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
+      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+      ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
 
-    /////////////// begin AllReduce steps ///////////////
-    ssize_t offset;
-    int nelem;
-    int chunk;
+      /////////////// begin AllReduce steps ///////////////
+      ssize_t offset;
+      int nelem;
+      int chunk;
 
-    // step 0: push data to next GPU
-    chunk = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + chunk * realChunkSize;
-    nelem = min(realChunkSize, size-offset);
-
-    prims.send(thisInput+offset, nelem);
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
+      // step 0: push data to next GPU
+      chunk = ring->devUserRanks[nranks-1];
       offset = chunkOffset + chunk * realChunkSize;
       nelem = min(realChunkSize, size-offset);
 
-      prims.recvReduceSend(thisInput+offset, nelem);
-    }
+      prims.send(thisInput+offset, nelem);
 
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
-    chunk = ring->devUserRanks[0];
-    offset = chunkOffset + chunk * realChunkSize;
-    nelem = min(realChunkSize, size-offset);
+      // k-2 steps: reduce and copy to next GPU
+      for (int j=2; j<nranks; ++j) {
+        chunk = ring->devUserRanks[nranks-j];
+        offset = chunkOffset + chunk * realChunkSize;
+        nelem = min(realChunkSize, size-offset);
 
-    prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
-
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + chunk * realChunkSize;
-      nelem = min(realChunkSize, size-offset);
-
-      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
-    }
-
-    // Make final copy from buffer to dest.
-    chunk = ring->devUserRanks[1];
-    offset = chunkOffset + chunk * realChunkSize;
-    nelem = min(realChunkSize, size-offset);
-
-    // Final wait/copy.
-    prims.directRecv(thisOutput+offset, offset, nelem);
-  }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads-WARP_SIZE;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  int chunkSize = args->coll.lastChunkSize;
-  const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-
-  if (loopSize > size) {
-    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
-  }
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  do {
-    struct ncclTree* tree = &channel->treeUp;
-    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-    ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Up
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-      } else if (tree->down[0] == -1) {
-        prims.send(thisInput+offset, nelem);
-      } else {
         prims.recvReduceSend(thisInput+offset, nelem);
       }
-    }
-  } while(0);
 
-  do {
-    struct ncclTree* tree = &channel->treeDn;
-    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-    ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Down
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        prims.directSend(thisOutput+offset, offset, nelem);
-      } else if (tree->down[0] == -1) {
-        prims.directRecv(thisOutput+offset, offset, nelem);
-      } else {
+      // step k-1: reduce this buffer and data, which will produce the final
+      // result that we store in this data and push to the next GPU
+      chunk = ring->devUserRanks[0];
+      offset = chunkOffset + chunk * realChunkSize;
+      nelem = min(realChunkSize, size-offset);
+
+      prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
+
+      // k-2 steps: copy to next GPU
+      for (int j=1; j<nranks-1; ++j) {
+        chunk = ring->devUserRanks[nranks-j];
+        offset = chunkOffset + chunk * realChunkSize;
+        nelem = min(realChunkSize, size-offset);
+
         prims.directRecvCopySend(thisOutput+offset, offset, nelem);
       }
-    }
-  } while(0);
-}
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads-WARP_SIZE;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  int chunkSize = args->coll.lastChunkSize;
-  const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
+      // Make final copy from buffer to dest.
+      chunk = ring->devUserRanks[1];
+      offset = chunkOffset + chunk * realChunkSize;
+      nelem = min(realChunkSize, size-offset);
 
-  if (loopSize > size) {
-    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
-  }
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  if (blockIdx.x < nChannels) { // first half of the channels do reduce
-    struct ncclTree* tree = &channel->collTreeUp;
-    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Up
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-      } else if (tree->down[0] == -1) {
-        prims.send(thisInput+offset, nelem);
-      } else {
-        prims.recvReduceSend(thisInput+offset, nelem);
-      }
+      // Final wait/copy.
+      prims.directRecv(thisOutput+offset, offset, nelem);
     }
   }
+};
 
-  if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
-    struct ncclTree* tree = &channel->collTreeDn;
-    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Down
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        prims.send(thisOutput+offset, nelem);
-      } else if (tree->down[0] == -1) {
-        prims.recv(thisOutput+offset, nelem);
-      } else {
-        prims.recvCopySend(thisOutput+offset, nelem);
-      }
-    }
-  }
-}
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads-2*WARP_SIZE;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclTree* tree = &channel->tree;
+    const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+    int chunkSize = args->coll.lastChunkSize;
+    const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+    const ssize_t loopSize = nChannels*chunkSize;
+    const ssize_t size = args->coll.count;
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*nranks*chunkSize;
-  const ssize_t size = args->coll.count;
-
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
-
-    /////////////// begin AllReduce steps ///////////////
-    ssize_t offset;
-    int nelem;
-    int chunk;
-
-    // step 0: push data to next GPU
-    chunk = ring->devUserRanks[nranks-1];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    LLprims.send(thisInput+offset, nelem);
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.recvReduceSend(thisInput+offset, nelem);
+    if (loopSize > size) {
+      chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
     }
 
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
-    chunk = ring->devUserRanks[0];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
-
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.recvCopySend(thisOutput+offset, nelem);
-    }
-
-    // Make final copy from buffer to dest.
-    chunk = ring->devUserRanks[1];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    // Here we need to copy from buffer to this output.
-    LLprims.recv(thisOutput+offset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-
-  if (loopSize > size) {
-    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
-  }
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  do {
-    struct ncclTree* tree = &channel->treeUp;
-    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Up
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-      } else if (tree->down[0] == -1) {
-        LLprims.send(thisInput+offset, nelem);
-      } else {
-        LLprims.recvReduceSend(thisInput+offset, nelem);
-      }
-    }
-  } while(0);
-
-  do {
-    struct ncclTree* tree = &channel->treeDn;
-    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Down
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        LLprims.send(thisOutput+offset, nelem);
-      } else if (tree->down[0] == -1) {
-        LLprims.recv(thisOutput+offset, nelem);
-      } else {
-        LLprims.recvCopySend(thisOutput+offset, nelem);
-      }
-    }
-  } while(0);
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-
-  if (loopSize > size) {
-    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
-  }
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  if (blockIdx.x < nChannels) { // first half of the channels do reduce
-    struct ncclTree* tree = &channel->collTreeUp;
-    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Up
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-      } else if (tree->down[0] == -1) {
-        LLprims.send(thisInput+offset, nelem);
-      } else {
-        LLprims.recvReduceSend(thisInput+offset, nelem);
-      }
-    }
-  }
-
-  if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
-    struct ncclTree* tree = &channel->collTreeDn;
-    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      // Down
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      if (tree->up == -1) {
-        LLprims.send(thisOutput+offset, nelem);
-      } else if (tree->down[0] == -1) {
-        LLprims.recv(thisOutput+offset, nelem);
-      } else {
-        LLprims.recvCopySend(thisOutput+offset, nelem);
-      }
-    }
-  }
-}
-
-#include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*nranks*chunkSize;
-  const ssize_t size = args->coll.count;
-
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
-
-    /////////////// begin AllReduce steps ///////////////
-    ssize_t offset;
-    int nelem;
-    int chunk;
-
-    // step 0: push data to next GPU
-    chunk = ring->devUserRanks[nranks-1];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    LLprims.send(thisInput+offset, nelem);
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.recvReduceSend(thisInput+offset, nelem);
-    }
-
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
-    chunk = ring->devUserRanks[0];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
-
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.recvCopySend(thisOutput+offset, nelem);
-    }
-
-    // Make final copy from buffer to dest.
-    chunk = ring->devUserRanks[1];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    // Here we need to copy from buffer to this output.
-    LLprims.recv(thisOutput+offset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclTree* treeUp = &channel->treeUp;
-  struct ncclTree* treeDn = &channel->treeDn;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = args->coll.lastChunkSize;
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
-  const ssize_t loopSize = nChannels*chunkSize;
-  int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
-  const ssize_t size = args->coll.count;
-
-  if (loopSize > size) {
-    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
-  }
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  if (treeUp->up == -1) {
-    // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
-    ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, stepSize, channel, comm);
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      ssize_t offset = gridOffset + bid*chunkSize;
-      int nelem = min(chunkSize, size-offset);
-      LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
-    }
-  } else {
-    if (tid < nthreadsSplit) {
+#if 1
+    if (tid < nthreads+WARP_SIZE) {
       // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-      ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
+        prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         // Up
         ssize_t offset = gridOffset + bid*chunkSize;
         int nelem = min(chunkSize, size-offset);
-        if (treeUp->down[0] == -1) {
+        if (tree->up == -1) {
+          prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else if (tree->down[0] == -1) {
+          prims.send(thisInput+offset, nelem);
+        } else {
+          prims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
+    }
+
+    if (tid < nthreads+WARP_SIZE) {
+      // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+      ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
+        prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Down
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (tree->up == -1) {
+          prims.directSend(thisOutput+offset, offset, nelem);
+        } else if (tree->down[0] == -1) {
+          prims.directRecv(thisOutput+offset, offset, nelem);
+        } else {
+          prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+        }
+      }
+    }
+#else
+    int nthreadsSplit = nthreads/2;
+    if (nthreadsSplit == 256) nthreadsSplit += 64;
+    if (tree->up == -1) {
+      if (tid < nthreads+WARP_SIZE) {
+        // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
+        ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY, 1, FUNC>
+          prims(tid, nthreads, tree->down, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
+        }
+      }
+    } else {
+      if (tid < nthreadsSplit + WARP_SIZE) {
+        // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+        ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
+          prims(tid, nthreadsSplit, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          // Up
+          ssize_t offset = gridOffset + bid*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          if (tree->down[0] == -1) {
+            prims.send(thisInput+offset, nelem);
+          } else {
+            prims.recvReduceSend(thisInput+offset, nelem);
+          }
+        }
+      } else {
+        // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+        ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
+          prims(tid-nthreadsSplit-WARP_SIZE, nthreads-nthreadsSplit, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 2);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          // Down
+          ssize_t offset = gridOffset + bid*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          if (tree->down[0] == -1) {
+            prims.directRecv(thisOutput+offset, offset, nelem);
+          } else {
+            prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+          }
+        }
+      }
+    }
+#endif
+  }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads-WARP_SIZE;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclTree* tree = &channel->collTree;
+    const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+    int chunkSize = args->coll.lastChunkSize;
+    const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+    const ssize_t loopSize = nChannels*chunkSize;
+    const ssize_t size = args->coll.count;
+
+    if (loopSize > size) {
+      chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+    }
+
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    if (blockIdx.x < nChannels) { // first half of the channels do reduce
+      ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Up
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (tree->up == -1) {
+          prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else if (tree->down[0] == -1) {
+          prims.send(thisInput+offset, nelem);
+        } else {
+          prims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
+    }
+
+    if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
+      ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Down
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (tree->up == -1) {
+          prims.send(thisOutput+offset, nelem);
+        } else if (tree->down[0] == -1) {
+          prims.recv(thisOutput+offset, nelem);
+        } else {
+          prims.recvCopySend(thisOutput+offset, nelem);
+        }
+      }
+    }
+  }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclRing* ring = &channel->ring;
+    const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+    ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+    const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
+    const int nranks = comm->nRanks;
+    const ssize_t loopSize = nChannels*nranks*chunkSize;
+    const ssize_t size = args->coll.count;
+
+    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+
+      /////////////// begin AllReduce steps ///////////////
+      ssize_t offset;
+      int nelem;
+      int chunk;
+
+      // step 0: push data to next GPU
+      chunk = ring->devUserRanks[nranks-1];
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.send(thisInput+offset, nelem);
+
+      // k-2 steps: reduce and copy to next GPU
+      for (int j=2; j<nranks; ++j) {
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+        nelem = min(chunkSize, size-offset);
+
+        LLprims.recvReduceSend(thisInput+offset, nelem);
+      }
+
+      // step k-1: reduce this buffer and data, which will produce the final
+      // result that we store in this data and push to the next GPU
+      chunk = ring->devUserRanks[0];
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+
+      // k-2 steps: copy to next GPU
+      for (int j=1; j<nranks-1; ++j) {
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+        nelem = min(chunkSize, size-offset);
+
+        LLprims.recvCopySend(thisOutput+offset, nelem);
+      }
+
+      // Make final copy from buffer to dest.
+      chunk = ring->devUserRanks[1];
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      // Here we need to copy from buffer to this output.
+      LLprims.recv(thisOutput+offset, nelem);
+    }
+  }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclTree* tree = &channel->tree;
+    const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+    ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+    const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
+    const ssize_t loopSize = nChannels*chunkSize;
+    const ssize_t size = args->coll.count;
+
+    if (loopSize > size) {
+      chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+    }
+
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    do {
+      // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+      ncclLLPrimitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Up
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (tree->up == -1) {
+          LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else if (tree->down[0] == -1) {
           LLprims.send(thisInput+offset, nelem);
         } else {
           LLprims.recvReduceSend(thisInput+offset, nelem);
         }
       }
-    } else {
+    } while(0);
+
+    do {
       // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-      ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, stepSize, channel, comm);
+      ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         // Down
         ssize_t offset = gridOffset + bid*chunkSize;
         int nelem = min(chunkSize, size-offset);
-        if (treeDn->down[0] == -1) {
+        if (tree->up == -1) {
+          LLprims.send(thisOutput+offset, nelem);
+        } else if (tree->down[0] == -1) {
+          LLprims.recv(thisOutput+offset, nelem);
+        } else {
+          LLprims.recvCopySend(thisOutput+offset, nelem);
+        }
+      }
+    } while(0);
+  }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclTree* tree = &channel->collTree;
+    const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+    ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+    const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
+    const ssize_t loopSize = nChannels*chunkSize;
+    const ssize_t size = args->coll.count;
+
+    if (loopSize > size) {
+      chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+    }
+
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    if (blockIdx.x < nChannels) { // first half of the channels do reduce
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Up
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (tree->up == -1) {
+          LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else if (tree->down[0] == -1) {
+          LLprims.send(thisInput+offset, nelem);
+        } else {
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
+    }
+
+    if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Down
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (tree->up == -1) {
+          LLprims.send(thisOutput+offset, nelem);
+        } else if (tree->down[0] == -1) {
           LLprims.recv(thisOutput+offset, nelem);
         } else {
           LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -525,7 +451,154 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
       }
     }
   }
-}
+};
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
+#include "prims_ll128.h"
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclRing* ring = &channel->ring;
+    const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+    ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+    const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+    const int nranks = comm->nRanks;
+    const ssize_t loopSize = nChannels*nranks*chunkSize;
+    const ssize_t size = args->coll.count;
+
+    ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+
+      /////////////// begin AllReduce steps ///////////////
+      ssize_t offset;
+      int nelem;
+      int chunk;
+
+      // step 0: push data to next GPU
+      chunk = ring->devUserRanks[nranks-1];
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.send(thisInput+offset, nelem);
+
+      // k-2 steps: reduce and copy to next GPU
+      for (int j=2; j<nranks; ++j) {
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+        nelem = min(chunkSize, size-offset);
+
+        LLprims.recvReduceSend(thisInput+offset, nelem);
+      }
+
+      // step k-1: reduce this buffer and data, which will produce the final
+      // result that we store in this data and push to the next GPU
+      chunk = ring->devUserRanks[0];
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+
+      // k-2 steps: copy to next GPU
+      for (int j=1; j<nranks-1; ++j) {
+        chunk = ring->devUserRanks[nranks-j];
+        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+        nelem = min(chunkSize, size-offset);
+
+        LLprims.recvCopySend(thisOutput+offset, nelem);
+      }
+
+      // Make final copy from buffer to dest.
+      chunk = ring->devUserRanks[1];
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      // Here we need to copy from buffer to this output.
+      LLprims.recv(thisOutput+offset, nelem);
+    }
+  }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads;
+    const int bid = args->coll.bid;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclTree* tree = &channel->tree;
+    const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+    ssize_t chunkSize = args->coll.lastChunkSize;
+    const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
+    const ssize_t loopSize = nChannels*chunkSize;
+    int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
+    const ssize_t size = args->coll.count;
+
+    if (loopSize > size) {
+      chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+    }
+
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    if (tree->up == -1) {
+      // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
+      ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, tree->down, tree->down, stepSize, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+      }
+    } else {
+      if (tid < nthreadsSplit) {
+        // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+        ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreadsSplit, tree->down, &tree->up, stepSize, channel, comm);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          // Up
+          ssize_t offset = gridOffset + bid*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          if (tree->down[0] == -1) {
+            LLprims.send(thisInput+offset, nelem);
+          } else {
+            LLprims.recvReduceSend(thisInput+offset, nelem);
+          }
+        }
+      } else {
+        // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+        ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, stepSize, channel, comm);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          // Down
+          ssize_t offset = gridOffset + bid*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          if (tree->down[0] == -1) {
+            LLprims.recv(thisOutput+offset, nelem);
+          } else {
+            LLprims.recvCopySend(thisOutput+offset, nelem);
+          }
+        }
+      }
+    }
+  }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+__device__ void run(struct ncclWorkElem* args) { }
+};
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index 8c8dbb602e..77595858bf 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
+IMPL_COLL_C(Broadcast);
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index a4c30d235a..72216acaf5 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -8,152 +8,155 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads-WARP_SIZE;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->coll.root;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads-WARP_SIZE;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int nextRank = ring->devUserRanks[1];
+      const int root = args->coll.root;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*realChunkSize;
-    int nelem = min(realChunkSize, size-offset);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t offset = gridOffset + bid*realChunkSize;
+        int nelem = min(realChunkSize, size-offset);
 
-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        prims.send(thisInput+offset, nelem);
-      } else {
-        prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+        if (rank == root) {
+          if (thisInput == thisOutput) {
+            prims.send(thisInput+offset, nelem);
+          } else {
+            prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+          }
+        } else if (nextRank == root) {
+          prims.recv(thisOutput+offset, nelem);
+        } else {
+          prims.recvCopySend(thisOutput+offset, nelem);
+        }
       }
-    } else if (nextRank == root) {
-      prims.recv(thisOutput+offset, nelem);
-    } else {
-      prims.recvCopySend(thisOutput+offset, nelem);
     }
-  }
-}
+};
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int nextRank = ring->devUserRanks[1];
+      const int root = args->coll.root;
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->coll.root;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t offset = gridOffset + bid*chunkSize;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
-    }
-    ssize_t offset = gridOffset + bid*chunkSize;
-
-    int nelem = min(chunkSize, size-offset);
-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        LLprims.send(thisInput+offset, nelem);
-      } else {
-        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+        int nelem = min(chunkSize, size-offset);
+        if (rank == root) {
+          if (thisInput == thisOutput) {
+            LLprims.send(thisInput+offset, nelem);
+          } else {
+            LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+          }
+        } else if (nextRank == root) {
+          LLprims.recv(thisOutput + offset, nelem);
+        } else {
+          LLprims.recvCopySend(thisOutput + offset, nelem);
+        }
       }
-    } else if (nextRank == root) {
-      LLprims.recv(thisOutput + offset, nelem);
-    } else {
-      LLprims.recvCopySend(thisOutput + offset, nelem);
     }
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
+};
 
 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->coll.root;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int nextRank = ring->devUserRanks[1];
+      const int root = args->coll.root;
 
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
-    ssize_t offset = gridOffset + bid*chunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+        ssize_t offset = gridOffset + bid*chunkSize;
 
-    int nelem = min(chunkSize, size-offset);
-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        LLprims.send(thisInput+offset, nelem);
-      } else {
-        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+        int nelem = min(chunkSize, size-offset);
+        if (rank == root) {
+          if (thisInput == thisOutput) {
+            LLprims.send(thisInput+offset, nelem);
+          } else {
+            LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+          }
+        } else if (nextRank == root) {
+          LLprims.recv(thisOutput + offset, nelem);
+        } else {
+          LLprims.recvCopySend(thisOutput + offset, nelem);
+        }
       }
-    } else if (nextRank == root) {
-      LLprims.recv(thisOutput + offset, nelem);
-    } else {
-      LLprims.recvCopySend(thisOutput + offset, nelem);
     }
-  }
-}
+};
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index a76f4e8bad..265218a228 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -10,6 +10,15 @@
 #include "collectives.h"
 #include "devcomm.h"
 
+
+#if __CUDA_ARCH__ >= 800
+#define COLL_UNROLL 8
+#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1)  // Using balanced tree instead of split tree
+#else
+#define COLL_UNROLL 4
+#define NCCL_MAX_DEV_ARITY NCCL_MAX_TREE_ARITY
+#endif
+
 // Exit If Abort Barrier across CTA: make sure all threads exit consistently
 // Each thread sets a predicate to true if abort == 1
 // all CTA's threads enter the barrier and do a popc on their predicates being True
@@ -19,12 +28,12 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
   asm ("{");
   asm volatile ("   .reg .pred barr_pred;");
   asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
-  asm volatile ("   bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
+  asm volatile ("   bar.red.popc.u32 %0, 0, barr_pred;" : "=r"(popc));
   asm ("}");
   if (popc) { asm volatile ("exit;"); }
 }
 
-typedef void(*ncclKern_t)(struct CollectiveArgs* args);
+typedef void(*ncclKern_t)(struct ncclWorkElem* args);
 extern __device__ ncclKern_t ncclFuncs[];
 
 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
@@ -32,131 +41,143 @@ static __device__ void load_parallel(void* dst, void* src, size_t size, int tid)
   int* s = (int*)src;
   for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
 }
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) {
+static __device__ void load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm) {
+  __syncthreads();
+  load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
   // Check whether the last operation was aborted and make sure all threads exit
   int abort = tid == 0 ? *(comm->abortFlag) : 0;
   exitIfAbortBarrier(abort);
-  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
-  __syncthreads();
-  if (tid == 0) hostColl->active = 0;
+  if (tid == 0) hostWork->elems[0].active = 0;
 }
 
-extern __device__ volatile uint64_t* ncclShmem;
+template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction {
+  public:
+  __device__ void run(struct ncclWorkElem* args) {}
+};
 
-/* Functions for aggregation case */
-#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
-__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
-  coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
+struct ncclShmemPtrs {
+  void* srcs[NCCL_MAX_DEV_ARITY+1];
+  void* dsts[NCCL_MAX_DEV_ARITY+1];
+};
+
+struct ncclShmemData {
+  union {
+    volatile uint64_t data[NCCL_LL128_SHMEM_SIZE];
+    struct ncclShmemPtrs ptrs[NCCL_MAX_GROUPS];
+  };
+  struct ncclWork localWork;
+};
+
+extern __device__ struct ncclShmemData *ncclShmem;
+template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL, int FINDEX>
+__device__ void ncclKernel(struct ncclWorkElem first)  {
+  int tid = threadIdx.x;
+  int bid = blockIdx.x;
+  __shared__ struct ncclShmemData shmem;
+  ncclShmem = &shmem;
+
+  auto f = ncclFunction<FUNCTION, ALGO, PROTO, REDOP, T, UNROLL>();
+
+  struct ncclDevComm* comm = first.comm;
+  struct ncclChannel* channel = comm->channels+bid;
+  struct ncclWorkElem* w = NULL;
+  uint16_t index = first.index;
+
+  /* To optimize for latency, (only) the first operation is passed as argument.*/
+  if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
+
+  while (1) {
+    if (w == NULL) {
+      w = shmem.localWork.elems;
+      load_coll(&shmem.localWork, channel->workFifo+index, tid, comm);
+    }
+    if (tid < w->nThreads) {
+      if (w->funcIndex == FINDEX) {
+        f.run(w);
+      } else {
+        ncclFuncs[w->funcIndex](w);
+      }
+    }
+    index = (index+1) % NCCL_MAX_OPS;
+    if (w->active == 2) {
+      return;
+    }
+    w = NULL;
+  }
 }
 
+// Only generate kernels for SUM
 #if NCCL_OP == 0
-/* Kernels with the first operation inlined */
-#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
-__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
-  int tid = threadIdx.x; \
-  int bid = blockIdx.x; \
-  __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
-  ncclShmem = shmem; \
-  __shared__ struct ncclColl localColl; \
- \
-  struct ncclDevComm* comm = firstColl.args.comm; \
-  struct ncclChannel* channel = comm->channels+bid; \
-  struct ncclColl* c; \
-  if (bid == 0) { \
-    /* To optimize for latency, (only) the first operation is passed as argument.*/ \
-    c = &firstColl; \
-  } else { \
-    c = &localColl; \
-    load_coll(c, channel->collectives+channel->collFifoHead, tid, comm); \
-  } \
-  while (1) { \
-    if (tid < c->args.common.nThreads) { \
-      if (c->funcIndex == fIndex) { \
-        coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
-      } else { \
-        ncclFuncs[c->funcIndex](&c->args); \
-      } \
-    } \
-    int nextIndex = c->nextIndex; \
-    if (tid == 0) channel->collFifoHead = nextIndex; \
- \
-    if (c->active == 2) { \
-      return; \
-    } \
- \
-    /* Load next collective operation*/ \
-    c = &localColl; /* for bid 0 */ \
-    load_coll(c, channel->collectives+nextIndex, tid, comm); \
-  } \
+#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \
+__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first) { \
+  ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex>(first); \
 }
 #else
-#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
+#define IMPL_COLL_KERN(func, algo, proto, redop, type, fInded)
 #endif
 
-// Only generate inline kernels for LL
-#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
-  IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \
+// Examples :     AllReduce, RING, LL,    Sum,   uint8
+#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \
+__device__ void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args) { \
+  auto f = ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL>(); \
+  f.run(args); \
+}
 
-#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
-  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
-  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
-  IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
+// Only generate inline kernels for LL
+#define IMPL_COLL4(func, algo, redop, type, ncclType) \
+  IMPL_COLL_FUNC(func, algo, LL,     redop, type) \
+  IMPL_COLL_FUNC(func, algo, LL128,  redop, type) \
+  IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type) \
+  IMPL_COLL_KERN(func, algo, LL,     redop, type, FUNC_INDEX(ncclFunc##func, nccl##redop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \
+
+#define IMPL_COLL3(func, redop, type, ncclType) \
+  IMPL_COLL4(func, TREE,    redop, type, ncclType) \
+  IMPL_COLL4(func, RING,    redop, type, ncclType) \
+  IMPL_COLL4(func, COLLNET, redop, type, ncclType)
 
 #if NCCL_TYPE == 0
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int8_t,   ncclInt8)
 #elif NCCL_TYPE == 1
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, u8,  uint8_t,  ncclColl, ncclOp, ncclUint8)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint8_t,  ncclUint8)
 #elif NCCL_TYPE == 2
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, i32, int32_t,  ncclColl, ncclOp, ncclInt32)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int32_t,  ncclInt32)
 #elif NCCL_TYPE == 3
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint32_t, ncclUint32)
 #elif NCCL_TYPE == 4
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, i64, int64_t,  ncclColl, ncclOp, ncclInt64)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int64_t,  ncclInt64)
 #elif NCCL_TYPE == 5
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint64_t, ncclUint64)
 #elif NCCL_TYPE == 6
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, f16, half,     ncclColl, ncclOp, ncclFloat16)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, half,     ncclFloat16)
 #elif NCCL_TYPE == 7
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, f32, float,    ncclColl, ncclOp, ncclFloat32)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, float,    ncclFloat32)
 #elif NCCL_TYPE == 8
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, f64, double,   ncclColl, ncclOp, ncclFloat64)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, double,   ncclFloat64)
 #endif
 
 // Reduction define all functions
 #if NCCL_OP == 0
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, sum,  FuncSum,  colln, ncclSum);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Sum);
 #elif NCCL_OP == 1
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Prod);
 #elif NCCL_OP == 2
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, min,  FuncMin,  colln, ncclMin);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Min);
 #elif NCCL_OP == 3
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, max,  FuncMax,  colln, ncclMax);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Max);
 #endif
 
-// Copy primitives only define one
 #if NCCL_OP == 0 && NCCL_TYPE == 0
-#define IMPL_COLL_C(collf, colln) \
-  IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
+// Copy primitives only define one function for copy
+#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
+
+// Point-to-point primitives only have one function/kernel.
+#define IMPL_COLL_P(func) \
+  IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
+  IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, 0);
 #else
-#define IMPL_COLL_C(collf, colln)
+#define IMPL_COLL_C(func)
+#define IMPL_COLL_P(func)
 #endif
 
-#define COLL_UNROLL 4
-
 #endif
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index aa1e936444..ff466a0dac 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,6 +16,12 @@
 // Define min for ssize_t
 static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
 
+template <typename T>
+inline __device__ void loadPtr(void** ptr, T* &v) {
+  asm volatile("ld.volatile.global.u64 %0, [%1];"
+      : "=l"(v) : "l"(ptr));
+}
+
 typedef uint64_t PackType;
 
 // unpack x and y to elements of type T and apply FUNC to each element
@@ -245,28 +251,57 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
   asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
 }
 
-template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthreads,
-    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
-    const int offset, const int N) {
-  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
-    T val = vFetch(srcs[0]+idx);
-    #pragma unroll
-    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
-    #pragma unroll 1
-    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
+    int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem) {
+  const int inc = nw * UNROLL * WARP_SIZE;
+  int offset = w * UNROLL * WARP_SIZE + t;
+
+  const T* srcs[MAXSRCS];
+  for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
+  T* dsts[MAXDSTS];
+  for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;
+
+  while (offset < Nelem) {
+    T vals[UNROLL];
+    // Load and reduce
+    for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);
 
     #pragma unroll
-    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
-    #pragma unroll 1
-    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+    for (int i=1; i<MINSRCS; i++) {
+      T vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
+    }
+    #pragma unroll
+    for (int i=MINSRCS; i<MAXSRCS; i++) {
+      if (i<nsrcs) {
+        T vals2[UNROLL];
+        for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
+        for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
+      }
+    }
+
+    // Store
+    #pragma unroll
+    for (int i = 0; i < MINDSTS; i++) {
+      for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    #pragma unroll
+    for (int i=MINDSTS; i<MAXDSTS; i++) {
+      if (i<ndsts) {
+        for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
+      }
+    }
+    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+    offset += inc;
   }
 }
 
 template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
-    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
-    const int elemOffset, const int Npack) {
+__device__ __forceinline__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
+    int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Npack) {
   const int inc = nw * UNROLL * WARP_SIZE;
   int offset = w * UNROLL * WARP_SIZE + t;
 
@@ -280,25 +315,31 @@ __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw,
     // Load and reduce
     for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
 
+    #pragma unroll
     for (int i=1; i<MINSRCS; i++) {
       Pack128 vals2[UNROLL];
       for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
       for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
     }
-    #pragma unroll 1
-    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
-      Pack128 vals2[UNROLL];
-      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
-      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    #pragma unroll
+    for (int i=MINSRCS; i<MAXSRCS; i++) {
+      if (i<nsrcs) {
+        Pack128 vals2[UNROLL];
+        for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+        for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+      }
     }
 
     // Store
+    #pragma unroll
     for (int i = 0; i < MINDSTS; i++) {
       for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
     }
-    #pragma unroll 1
-    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
-      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    #pragma unroll
+    for (int i=MINDSTS; i<MAXDSTS; i++) {
+      if (i<ndsts) {
+        for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+      }
     }
     for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
     for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
@@ -309,72 +350,65 @@ __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw,
 template <typename T>
 __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
 
-// Try to limit consecutive load/stores to 8.
-// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
-#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+#define PACKELEMS (sizeof(Pack128) / sizeof(T))
 
 template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
 __device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
-    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    int nsrcs, const T** srcs, int ndsts, T** dsts,
     int N) {
   int Nrem = N;
   if (Nrem <= 0) return;
 
-  int alignDiff = 0;
-  int align = ptrAlign128(srcs[0]);
-  #pragma unroll
-  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
-  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
-  #pragma unroll
-  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
-  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
-
-  int Npreamble = alignDiff ? Nrem :
-    N < alignof(Pack128) ? N :
-    (alignof(Pack128) - align) % alignof(Pack128);
-
-  // stage 1: preamble: handle any elements up to the point of everything coming
-  // into alignment
-  if (Npreamble) {
-    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
-    Nrem -= Npreamble;
-    if (Nrem == 0) return;
-  }
-  int offset = Npreamble;
-
-  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
-  // assuming the pointers we have are all 128-bit alignable.
   int w = tid / WARP_SIZE;       // Warp number
   int nw = nthreads / WARP_SIZE; // Number of warps
   int t = tid % WARP_SIZE;       // Thread (inside the warp)
 
-  const int packFactor = sizeof(Pack128) / sizeof(T);
+  // Check that all is 16B aligned. If not don't use 16B load/stores.
+  int align = 0;
+  #pragma unroll
+  for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
+  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
+  #pragma unroll
+  for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
+  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);
 
-  // stage 2a: main loop
-  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
-      * (AUTOUNROLL * WARP_SIZE); // round down
-  int Nelem2a = Npack2a * packFactor;
+  int offset = 0;
+  if (align == 0) {
+    // fast path: use 128b loads/stores to do the bulk of the work,
+    // assuming the pointers we have are all 128-bit aligned.
 
-  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
+    // main loop
+    int Npack = (Nrem / (PACKELEMS*UNROLL*WARP_SIZE)) * (UNROLL*WARP_SIZE); // round down
+    int Nelem = Npack * PACKELEMS;
 
-  Nrem -= Nelem2a;
+    ReduceCopy128bMulti<FUNC, T, UNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
+
+    Nrem -= Nelem;
+    if (Nrem == 0) return;
+    offset += Nelem;
+
+    // slightly less optimized for section when we don't have full unrolling
+    Npack = Nrem / PACKELEMS;
+    Nelem = Npack * PACKELEMS;
+
+    ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
+
+    Nrem -= Nelem;
+    if (Nrem == 0) return;
+    offset += Nelem;
+  }
+
+  // unrolled, by-type (mostly for unaligned buffers)
+  int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down
+
+  ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);
+
+  Nrem -= Nelem;
   if (Nrem == 0) return;
-  offset += Nelem2a;
+  offset += Nelem;
 
-  // stage 2b: slightly less optimized for section when we don't have full
-  // unrolling
-
-  int Npack2b = Nrem / packFactor;
-  int Nelem2b = Npack2b * packFactor;
-
-  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
-
-  Nrem -= Nelem2b;
-  if (Nrem == 0) return;
-  offset += Nelem2b;
-
-  // stage 2c: tail
-  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
+  // no unroll, by type. Should finish what's remaining.
+  ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
 }
 
 #endif // COMMON_KERNEL_H_
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index 119cd361e2..553a882bcd 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -8,60 +8,60 @@
 #include "collectives.h"
 #include "common.h"
 
-__device__ volatile uint64_t* ncclShmem;
+__device__ struct ncclShmemData* ncclShmem;
 
-#define NCCL_FUNC5(coll, op, dtype) \
-  NCCL_COLL_NAME(coll##LL, op, dtype), \
-  NCCL_COLL_NAME(coll##LL128, op, dtype), \
-  NCCL_COLL_NAME(coll, op, dtype)
+#define NCCL_FUNC5(func, algo, redop, type) \
+  NCCL_FUNC_NAME(func, algo, LL,     redop, type), \
+  NCCL_FUNC_NAME(func, algo, LL128,  redop, type), \
+  NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
 
-#define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_FUNC5(coll##Tree, op, dtype), \
-  NCCL_FUNC5(coll##Ring, op, dtype), \
-  NCCL_FUNC5(coll##CollNet, op, dtype)
+#define NCCL_FUNC4(func, redop, type) \
+  NCCL_FUNC5(func, TREE,    redop, type), \
+  NCCL_FUNC5(func, RING,    redop, type), \
+  NCCL_FUNC5(func, COLLNET, redop, type)
 
 // Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  u8), \
-  NCCL_FUNC4(coll, op, i32), \
-  NCCL_FUNC4(coll, op, u32), \
-  NCCL_FUNC4(coll, op, i64), \
-  NCCL_FUNC4(coll, op, u64), \
-  NCCL_FUNC4(coll, op, f16), \
-  NCCL_FUNC4(coll, op, f32), \
-  NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8)
+#define NCCL_FUNCS3A(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, uint8_t), \
+  NCCL_FUNC4(func, redop, int32_t), \
+  NCCL_FUNC4(func, redop, uint32_t), \
+  NCCL_FUNC4(func, redop, int64_t), \
+  NCCL_FUNC4(func, redop, uint64_t), \
+  NCCL_FUNC4(func, redop, half), \
+  NCCL_FUNC4(func, redop, float), \
+  NCCL_FUNC4(func, redop, double)
+#define NCCL_FUNCS3B(func, redop) \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t), \
+  NCCL_FUNC4(func, redop, int8_t)
 
 // Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum ), \
-  NCCL_FUNCS3A(coll, prod), \
-  NCCL_FUNCS3A(coll, max ), \
-  NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
+#define NCCL_FUNCS2A(func) \
+  NCCL_FUNCS3A(func, Sum ), \
+  NCCL_FUNCS3A(func, Prod), \
+  NCCL_FUNCS3A(func, Max ), \
+  NCCL_FUNCS3A(func, Min )
+#define NCCL_FUNCS2B(func) \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum)
 
 // Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
-  NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
-  NCCL_FUNCS2B(ncclBroadcast), \
-  NCCL_FUNCS2A(ncclReduce), \
-  NCCL_FUNCS2B(ncclAllGather), \
-  NCCL_FUNCS2A(ncclReduceScatter), \
-  NCCL_FUNCS2A(ncclAllReduce) }
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\
+  NCCL_FUNCS2B(Broadcast), \
+  NCCL_FUNCS2A(Reduce), \
+  NCCL_FUNCS2B(AllGather), \
+  NCCL_FUNCS2A(ReduceScatter), \
+  NCCL_FUNCS2A(AllReduce) }
 
 // Must be consistent with the ncclFuncSet enum
 __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
@@ -69,12 +69,12 @@ __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCC
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
 #if __CUDA_ARCH__
-  NCCL_COLL_NAME(ncclSendRecv, copy, i8),
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
+  NCCL_FUNCS2B(Broadcast),
+  NCCL_FUNCS2A(Reduce),
+  NCCL_FUNCS2B(AllGather),
+  NCCL_FUNCS2A(ReduceScatter),
+  NCCL_FUNCS2A(AllReduce)
 #endif
 };
 
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 12078a81a3..69348db6b0 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -31,63 +31,57 @@
   } \
 } while (0)
 
+#define ROLE_SRC       0x01
+#define ROLE_DST       0x02
+#define ROLE_WAIT_RECV 0x04
+#define ROLE_WAIT_SEND 0x08
+#define ROLE_POST_SEND 0x10
+#define ROLE_POST_RECV 0x20
+
 // Implementation of primitive types
 template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
 class ncclPrimitives {
  private:
   const int tid;
-  const int nthreads;
-  const int wid;
+  int nthreads;
+  int nworkers;
   const int stepSize;
   int nrecv = 0;
   int nsend = 0;
-  struct ncclConnInfo* recvConn = NULL;
-  volatile uint64_t* recvConnHeadPtr = NULL;
-  uint64_t recvConnHead;
-  volatile uint64_t* recvConnTailPtr = NULL;
-  uint64_t recvConnTail;
-  uint64_t recvConnTailCache; // Cache last seen value
+  struct ncclConnInfo* conn = NULL;
+  volatile int* connSizesFifoPtr = NULL;
+  void** connPtrsFifoPtr = NULL;
+  volatile uint64_t* connHeadPtr = NULL;
+  volatile uint64_t* connTailPtr = NULL;
+  uint64_t connTailCache; // Cache last seen value
+  uint64_t connHeadCache; // Cache last seen value
 
-  struct ncclConnInfo* sendConn = NULL;
-  volatile int* sendConnFifoPtr = NULL;
-  volatile uint64_t* sendConnTailPtr = NULL;
-  uint64_t sendConnTail;
-  volatile uint64_t* sendConnHeadPtr = NULL;
-  uint64_t sendConnHead;
-  uint64_t sendConnHeadCache; // Cache last seen value
-
-  uint64_t recvStep[NRECV];
-  uint64_t sendStep[NSEND];
-  const T* recvDirectBuff[NRECV];
-  T* sendDirectBuff[NSEND];
-  const T* recvBuff[NRECV];
-  T* sendBuff[NSEND];
+  int index; // Peer index I'm responsible for
+  int peer = -1;
+  int role = 0;
+  int group;
+  uint64_t step;
+  T* direct = NULL;
+  T* buff;
   struct ncclDevComm* comm;
 
-  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
-  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
-  inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
-  inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
+  const T** srcs;
+  T** dsts;
 
+  // Don't use barrier 0 as it's used by the final sync
   inline __device__ void barrier() {
-    if (NSEND>NRECV) {
-      asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
-    } else {
-      asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
-    }
+    if (nthreads == WARP_SIZE) __syncwarp();
+    else asm volatile ("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
   }
   inline __device__ void subBarrier() {
-    if (NSEND>NRECV) {
-      asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
-    } else {
-      asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
-    }
+    if (nworkers == nthreads) barrier();
+    else asm volatile ("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
   }
 
   uint32_t spins = 0;
   uint32_t abort = 0;
 
-  inline __device__ int checkAbort(int i, int send) {
+  inline __device__ int checkAbort() {
     spins++;
     if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
       abort = *(comm->abortFlag);
@@ -96,63 +90,45 @@ class ncclPrimitives {
     return abort;
   }
 
-  inline __device__ void waitSend(int nbytes) {
-    spins = 0;
-    if (sendConnHeadPtr) {
-      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
-        sendConnHeadCache = *sendConnHeadPtr;
-        if (checkAbort(wid, 1)) break;
-      }
-      if (sendConnFifoPtr) {
-        sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes;
-      }
-      sendConnHead += SLICESTEPS;
-    }
+  template <int DIRECTPTR>
+  inline __device__ T* directPtr(ssize_t directOffset) {
+    return DIRECTPTR && direct ? direct+directOffset : buff+(step%NCCL_STEPS)*stepSize;
   }
 
-  inline __device__ void waitRecv() {
+  template <int DST, int DIRECTSEND>
+  inline __device__ void waitSend(ssize_t directOffset, int nbytes) {
     spins = 0;
-    if (recvConnTailPtr) {
-      while (recvConnTailCache < recvConnTail + SLICESTEPS) {
-        recvConnTailCache = *recvConnTailPtr;
-        if (checkAbort(wid, 0)) break;
-      }
-      recvConnTail += SLICESTEPS;
+    while (connHeadCache + NCCL_STEPS < step + SLICESTEPS) {
+      connHeadCache = *connHeadPtr;
+      if (checkAbort()) break;
     }
+    if (connSizesFifoPtr) {
+      connSizesFifoPtr[step%NCCL_STEPS] = nbytes;
+    }
+
+    if (connPtrsFifoPtr) loadPtr(connPtrsFifoPtr+step%NCCL_STEPS, dsts[DST+index]);
+    else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
+    step += SLICESTEPS;
   }
 
-  inline __device__ void incRecv(int i) {
-    recvStep[i] += SLICESTEPS;
+  template <int SRC, int DIRECTRECV>
+  inline __device__ void waitRecv(ssize_t directOffset) {
+    spins = 0;
+    while (connTailCache < step + SLICESTEPS) {
+      connTailCache = *connTailPtr;
+      if (checkAbort()) break;
+    }
+    if (connPtrsFifoPtr) loadPtr(connPtrsFifoPtr+step%NCCL_STEPS, srcs[SRC+index]);
+    else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
+    step += SLICESTEPS;
   }
+
   inline __device__ void postRecv() {
-    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += SLICESTEPS;
+    *connHeadPtr = step += SLICESTEPS;
   }
 
-  inline __device__ void incSend(int i) {
-    sendStep[i] += SLICESTEPS;
-  }
   inline __device__ void postSend() {
-    if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS;
-  }
-
-  template <int DIRECTRECV>
-  inline __device__ const T* directRecvPtr(int i, ssize_t directOffset) {
-    return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
-  }
-
-  template <int DIRECTSEND>
-  inline __device__ T* directSendPtr(int i, ssize_t directOffset) {
-    return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
-  }
-
-  template <int DIRECTRECV>
-  inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
-    return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
-  }
-
-  template <int DIRECTSEND>
-  inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
-    return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
+    *connTailPtr = step += SLICESTEPS;
   }
 
   template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
@@ -162,135 +138,128 @@ class ncclPrimitives {
     int sliceSize = stepSize*SLICESTEPS;
     int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
 
-    const T* srcs[RECV*NRECV+SRC];
-    srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
-    if (RECV) {
-      if (SRC) srcs[1] = recvPtr(0);
-      for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
-    }
-
-    T* dsts[SEND*NSEND+DST];
-    dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
-    if (SEND) {
-      if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
-      for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
-    }
-
-    bool syncThread = tid >= nthreads;
-
     #pragma unroll
     for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
       int realSize = max(0, min(dataSize, nelem-offset));
-      if (!syncThread) {
-        if (SEND) waitSend(realSize*sizeof(T));
-        if (RECV) waitRecv();
+      if (tid < nworkers) {
+        if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
+        if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<SRC, DIRECTRECV>(directOffset+offset);
+        if (DST && (role & ROLE_DST)) dsts[0] = dstPtr+offset;
+        if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
         if (realSize > 0) {
           subBarrier();
-          if (DIRECTRECV && recvDirectBuff[0]) {
+          if (DIRECTRECV && srcs[0] == dsts[0]) {
             // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
             if (SEND) {
-              ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+              // (1-SEND) is only there to avoid compilation errors in case NSEND=0 (and SEND=0).
+              ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, (1-SEND)+NSEND>(tid, nworkers, 1, srcs, nsend, dsts+1, realSize);
             }
           } else {
-            ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+            ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
           }
         }
       }
       barrier();
-      FOR_SEND(incSend);
-      FOR_RECV(incRecv);
-      if (syncThread) {
-        if (SEND) {
-          if (realSize > 0 && wid == 0) __threadfence_system();
-          __syncwarp();
-          postSend();
-        }
-        if (RECV) postRecv();
-      }
-      srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
-      for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
-      dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
-      for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
+      if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
+      __syncwarp();
+      if (SEND && (role & ROLE_POST_SEND)) postSend();
+      if (RECV && (role & ROLE_POST_RECV)) postRecv();
       offset += realSize;
     }
   }
 
-  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    recvBuff[i] = (const T*)conn->buffs[NCCL_PROTO_SIMPLE];
-    recvStep[i] = conn->step;
-    recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
-    recvDirectBuff[i] = NULL;
-    if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
-      recvDirectBuff[i] = directBuff;
-      if (tid == 0) *conn->ptrExchange = directBuff;
-    }
-    if (wid == i) recvConn = conn;
-    if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
-    nrecv++;
-  }
-  __device__ __forceinline__ void loadRecvSync() {
-    if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
-      recvConnTailPtr = recvConn->tail;
-      recvConnTailCache = *recvConnTailPtr;
-    }
-    if (tid >= nthreads && wid < nrecv) {
-      recvConnHeadPtr = recvConn->head;
-      // Return credits in case we rounded up.
-      *recvConnHeadPtr = recvConnHead;
+  __device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
+    if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
+      conn = &channel->devPeers[peer].recv.conn;
+      step = conn->step;
+      step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
+      if (role & ROLE_POST_RECV) {
+        connHeadPtr = conn->head;
+        // Return credits in case we rounded up.
+        *connHeadPtr = step;
+      }
+      if (role & ROLE_WAIT_RECV) {
+        buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+        if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
+          direct = directBuff;
+          *conn->ptrExchange = directBuff;
+        }
+        connTailPtr = conn->tail;
+        connTailCache = *connTailPtr;
+        connPtrsFifoPtr = conn->ptrsFifo;
+      }
     }
   }
 
-  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
-    sendBuff[i] = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
-    sendStep[i] = conn->step;
-    sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
-    sendDirectBuff[i] = NULL;
-    if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
-      void* volatile* ptr = conn->ptrExchange;
-      while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
-      barrier();
-      if (tid == 0) *ptr = NULL;
-    }
-    if (wid == i) sendConn = conn;
-    if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
-    nsend++;
-  }
-  __device__ __forceinline__ void loadSendSync() {
-    if (tid < nsend) {
-      sendConnHeadPtr = sendConn->head;
-      sendConnHeadCache = *sendConnHeadPtr;
-      sendConnFifoPtr = sendConn->fifo;
-    }
-    if (tid >= nthreads && wid<nsend) {
-      sendConnTailPtr = sendConn->tail;
+  __device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
+    if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
+      conn = &channel->devPeers[peer].send.conn;
+      step = conn->step;
+      step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
+      if (role & ROLE_POST_SEND) {
+        connTailPtr = conn->tail;
+      }
+      if (role & ROLE_WAIT_SEND) {
+        buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+        if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
+          void* volatile* ptr = conn->ptrExchange;
+          while ((direct = (T*)(*ptr)) == NULL);
+          *ptr = NULL;
+        }
+        connHeadPtr = conn->head;
+        connHeadCache = *connHeadPtr;
+        connSizesFifoPtr = conn->sizesFifo;
+        connPtrsFifoPtr = conn->ptrsFifo;
+      }
     }
   }
 
-  __device__ __forceinline__ void saveRecvSync() {
-    if (tid >= nthreads && wid < nrecv) {
-      recvConn->step = recvConnHead;
-      __threadfence_system();
-    }
-  }
-
-  __device__ __forceinline__ void saveSendSync() {
-    if (tid < nsend) {
-      sendConn->step = sendConnHead;
+  __device__ __forceinline__ void saveSync() {
+    if (role & (ROLE_POST_SEND|ROLE_POST_RECV)) {
+      conn->step = step;
       __threadfence_system();
     }
   }
 
  public:
   __device__ __forceinline__
-  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
-    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize) {
+  ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
+    : comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group) {
+    nthreads = nworkers;
+    // For send operations, we need an extra warp to overlap the threadfence and the copy
+    int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
+    nthreads += postThreads;
+
     // Make sure step is updated before we read it.
     barrier();
 
-    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
-    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
-    loadRecvSync();
-    loadSendSync();
+    for (int i=0; i<NRECV; i++) if (recvPeers[i] != -1) nrecv++;
+    for (int i=0; i<NSEND; i++) if (sendPeers[i] != -1) nsend++;
+
+    #define SYNC_GROUP 8
+    static_assert(NSEND < SYNC_GROUP && NRECV < SYNC_GROUP, "Not enough threads to cover all peers");
+
+    int g = tid / SYNC_GROUP;
+    int ng = nthreads / SYNC_GROUP;
+    index = tid % SYNC_GROUP;
+
+    if (g == 0) {
+      if (index < nrecv) role |= ROLE_WAIT_RECV;
+      if (index == nrecv) role |= ROLE_SRC;
+    } else if (g == 1) {
+      if (index < nsend) role |= ROLE_WAIT_SEND;
+      if (index == nsend) role |= ROLE_DST;
+    } else if (g == ng - 2) {
+      if (index < nrecv) role |= ROLE_POST_RECV;
+    } else if (g == ng - 1) {
+      if (index < nsend) role |= ROLE_POST_SEND;
+    }
+
+    if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) peer = recvPeers[index];
+    if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) peer = sendPeers[index];
+
+    loadRecvConn(channel, directBuff);
+    loadSendConn(channel);
   }
 
   __device__ __forceinline__ void
@@ -351,8 +320,7 @@ class ncclPrimitives {
 
   __device__ __forceinline__ ~ncclPrimitives() {
     // Save steps for the next operation
-    saveRecvSync();
-    saveSendSync();
+    saveSync();
   }
 };
 
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index 93c6db366a..9e362f9ee5 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -178,7 +178,7 @@ class ncclLLPrimitives {
       sendConnHeadPtr = sendConn->head;
       sendConnHeadCache = *sendConnHeadPtr;
       sendConnHead = sendConn->step;
-      sendConnFifoPtr = sendConn->fifo;
+      sendConnFifoPtr = sendConn->sizesFifo;
     }
   }
 
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index f97b25c9f2..999d0d5de7 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -211,14 +211,14 @@ class ncclLL128Primitives {
     /************************ Send **************************/
     if (SEND) {
       for (int i=1; i<NSEND && i<nsend; i++) {
-        int flag = sendFlag(i);
+        uint64_t flag = sendFlag(i);
         uint64_t* ptr = sendPtr(i)+ll128Offset;
         #pragma unroll
         for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
           store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
         }
       }
-      int flag = sendFlag(0);
+      uint64_t flag = sendFlag(0);
       uint64_t* ptr = sendPtr(0)+ll128Offset;
       #pragma unroll
       for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
@@ -318,10 +318,10 @@ class ncclLL128Primitives {
       sendConnHeadPtr = sendConn->head;
       sendConnHeadCache = *sendConnHeadPtr;
       sendConnHead = sendConn->step;
-      sendConnFifoPtr = sendConn->fifo;
+      sendConnFifoPtr = sendConn->sizesFifo;
     }
     if (tid >= nthreads-WARP_SIZE && wid<nsend) {
-      if (sendConn->fifo) {
+      if (sendConn->sizesFifo) {
         sendConnTailPtr = sendConn->tail;
         sendConnTail = sendConn->step;
       }
@@ -345,7 +345,7 @@ class ncclLL128Primitives {
  public:
   __device__ __forceinline__
   ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
-    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem->data+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
     // Make sure step is updated before we read it.
     barrier();
 
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index a2caac5fef..66f1bb2ec2 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_R(ncclReduce, ncclCollReduce);
+IMPL_COLL_R(Reduce);
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 115e05e0e3..313209d5f4 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -8,142 +8,145 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads-WARP_SIZE;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = ring->devUserRanks[0];
-  const int prevRank = ring->devUserRanks[nranks-1];
-  const int root = args->coll.root;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads-WARP_SIZE;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = ring->devUserRanks[0];
+      const int prevRank = ring->devUserRanks[nranks-1];
+      const int root = args->coll.root;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*realChunkSize;
-    int nelem = min(realChunkSize, size-offset);
-    if (prevRank == root) {
-      prims.send(thisInput+offset, nelem);
-    } else if (rank == root) {
-      prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-    } else {
-      prims.recvReduceSend(thisInput+offset, nelem);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t offset = gridOffset + bid*realChunkSize;
+        int nelem = min(realChunkSize, size-offset);
+        if (prevRank == root) {
+          prims.send(thisInput+offset, nelem);
+        } else if (rank == root) {
+          prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else {
+          prims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
     }
-  }
-}
+};
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = comm->rank;
+      const int prevRank = ring->devUserRanks[nranks-1];
+      const int root = args->coll.root;
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = comm->rank;
-  const int prevRank = ring->devUserRanks[nranks-1];
-  const int root = args->coll.root;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t offset = gridOffset + bid*chunkSize;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (prevRank == root) {
+          LLprims.send(thisInput+offset, nelem);
+        } else if (rank == root) {
+          LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else {
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
     }
-    ssize_t offset = gridOffset + bid*chunkSize;
-
-    int nelem = min(chunkSize, size-offset);
-    if (prevRank == root) {
-      LLprims.send(thisInput+offset, nelem);
-    } else if (rank == root) {
-      LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-    } else {
-      LLprims.recvReduceSend(thisInput+offset, nelem);
-    }
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
+};
 
 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
-  const int rank = comm->rank;
-  const int prevRank = ring->devUserRanks[nranks-1];
-  const int root = args->coll.root;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
+      const int rank = comm->rank;
+      const int prevRank = ring->devUserRanks[nranks-1];
+      const int root = args->coll.root;
 
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
-    ssize_t offset = gridOffset + bid*chunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+        ssize_t offset = gridOffset + bid*chunkSize;
 
-    int nelem = min(chunkSize, size-offset);
-    if (prevRank == root) {
-      LLprims.send(thisInput+offset, nelem);
-    } else if (rank == root) {
-      LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
-    } else {
-      LLprims.recvReduceSend(thisInput+offset, nelem);
+        int nelem = min(chunkSize, size-offset);
+        if (prevRank == root) {
+          LLprims.send(thisInput+offset, nelem);
+        } else if (rank == root) {
+          LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+        } else {
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
     }
-  }
-}
+};
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index 8b45299757..c2c6d42806 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
+IMPL_COLL_R(ReduceScatter);
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 52c858cc90..a0d45dc4a5 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -8,186 +8,189 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads-WARP_SIZE;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads-WARP_SIZE;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+      const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+      const ssize_t size = args->coll.count;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
+      ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
-    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*realChunkSize;
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t chunkOffset = gridOffset + bid*realChunkSize;
 
-    /////////////// begin ReduceScatter steps ///////////////
-    ssize_t offset;
-    int nelem = min(realChunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin ReduceScatter steps ///////////////
+        ssize_t offset;
+        int nelem = min(realChunkSize, size-chunkOffset);
+        int rankDest;
 
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[nranks-1];
+        offset = chunkOffset + rankDest * size;
 
-    prims.send(thisInput+offset, nelem);
+        prims.send(thisInput+offset, nelem);
 
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+        // k-2 steps: reduce and copy to next GPU
+        for (int j=2; j<nranks; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
 
-      prims.recvReduceSend(thisInput+offset, nelem);
+          prims.recvReduceSend(thisInput+offset, nelem);
+        }
+
+        // step k-1: reduce this buffer and data, which will produce the final result
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+      }
     }
+};
 
-    // step k-1: reduce this buffer and data, which will produce the final result
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
 
-    prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
-  }
-}
+      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        if (size-gridOffset < loopSize) {
+          chunkSize = args->coll.lastChunkSize;
+        }
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;
 
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
-  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
+        /////////////// begin ReduceScatter steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;
 
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[nranks-1];
+        offset = chunkOffset + rankDest * size;
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+        LLprims.send(thisInput+offset, nelem);
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->coll.lastChunkSize;
+        // k-2 steps: reduce and copy to next GPU
+        for (int j=2; j<nranks; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
+
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+
+        // step k-1: reduce this buffer and data, which will produce the final
+        // result that we store in this data
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+      }
     }
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
-    /////////////// begin ReduceScatter steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
-
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.send(thisInput+offset, nelem);
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
-
-      LLprims.recvReduceSend(thisInput+offset, nelem);
-    }
-
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
-
-    LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
+};
 
 #include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*chunkSize;
-  const ssize_t size = args->coll.count;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {
+      const int tid = threadIdx.x;
+      const int nthreads = args->nThreads;
+      const int bid = args->coll.bid;
+      const int nChannels = args->coll.nChannels;
+      struct ncclDevComm* comm = args->comm;
+      struct ncclChannel* channel = comm->channels+blockIdx.x;
+      struct ncclRing* ring = &channel->ring;
+      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+      ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+      // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+      const int nranks = comm->nRanks;
+      const ssize_t loopSize = nChannels*chunkSize;
+      const ssize_t size = args->coll.count;
 
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+      ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+      // Compute pointers
+      const T * __restrict__ thisInput = (const T*)args->sendbuff;
+      T * __restrict__ thisOutput = (T*)args->recvbuff;
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
 
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+        ssize_t chunkOffset = gridOffset + bid*chunkSize;
 
-    /////////////// begin ReduceScatter steps ///////////////
-    ssize_t offset;
-    int nelem = min(chunkSize, size-chunkOffset);
-    int rankDest;
+        /////////////// begin ReduceScatter steps ///////////////
+        ssize_t offset;
+        int nelem = min(chunkSize, size-chunkOffset);
+        int rankDest;
 
-    // step 0: push data to next GPU
-    rankDest = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + rankDest * size;
+        // step 0: push data to next GPU
+        rankDest = ring->devUserRanks[nranks-1];
+        offset = chunkOffset + rankDest * size;
 
-    LLprims.send(thisInput+offset, nelem);
+        LLprims.send(thisInput+offset, nelem);
 
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      rankDest = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + rankDest * size;
+        // k-2 steps: reduce and copy to next GPU
+        for (int j=2; j<nranks; ++j) {
+          rankDest = ring->devUserRanks[nranks-j];
+          offset = chunkOffset + rankDest * size;
 
-      LLprims.recvReduceSend(thisInput+offset, nelem);
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+
+        // step k-1: reduce this buffer and data, which will produce the final
+        // result that we store in this data
+        rankDest = ring->devUserRanks[0];
+        offset = chunkOffset + rankDest * size;
+
+        LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+      }
     }
+};
 
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data
-    rankDest = ring->devUserRanks[0];
-    offset = chunkOffset + rankDest * size;
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
 
-    LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
-  }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* args) {}
+};
diff --git a/src/collectives/device/sendrecv.cu b/src/collectives/device/sendrecv.cu
index 34e7adfdf8..59e38b528e 100644
--- a/src/collectives/device/sendrecv.cu
+++ b/src/collectives/device/sendrecv.cu
@@ -8,7 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-#if NCCL_OP == 0 && NCCL_TYPE == 0
-IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
-IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
-#endif
+IMPL_COLL_P(SendRecv);
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
index 7998ba64c0..1cb34f393c 100644
--- a/src/collectives/device/sendrecv.h
+++ b/src/collectives/device/sendrecv.h
@@ -8,74 +8,85 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->p2p.nThreads-2*WARP_SIZE;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+    __device__ void run(struct ncclWorkElem* firstArgs) {
+      struct ncclWorkElem* args = firstArgs;
+      int tid = threadIdx.x;
+      int group = 0;
+      for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
+        int nThreadsSegment = args->p2p.nThreads;
+        if (nThreadsSegment == 0) return; // Nothing else to do
+        int groupRecv = group;
+        group += 1;
+        int groupSend = group;
+        group += nThreadsSegment > 128 ? 2 : 1;
+        if (tid < nThreadsSegment) {
+          const int nThreads = nThreadsSegment > 128 ? nThreadsSegment-WARP_SIZE : nThreadsSegment;
 
-  // Compute pointers
-  const T* sendbuff = (const T*)args->sendbuff;
-  T* recvbuff = (T*)args->recvbuff;
+          // Compute pointers
+          const T* sendbuff = (const T*)args->sendbuff;
+          T* recvbuff = (T*)args->recvbuff;
+          const ssize_t sendCount = args->p2p.sendCount;
+          const ssize_t recvCount = args->p2p.recvCount;
 
-  if (args->p2p.delta < 0 ) return; // No-op
+          const int delta = args->p2p.delta;
+          if (delta == 0) {
+            if (tid < nThreads && sendbuff != recvbuff) {
+              // local copy : ReduceOrCopyMulti takes an int as number of elements,
+              // so we split it in blocks of 1G elements.
+              int blockSize = 1<<30;
+              for (size_t offset=0; offset<sendCount; offset += blockSize) {
+                size_t remaining = sendCount - offset;
+                if (remaining < blockSize) blockSize = remaining;
+                ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
+                sendbuff += blockSize; recvbuff += blockSize;
+              }
+            }
+          } else {
+            struct ncclDevComm* comm = args->comm;
+            struct ncclChannel* channel = comm->channels+blockIdx.x;
 
-  if (args->p2p.delta == 0) {
-    if (tid < nthreads && sendbuff != recvbuff) {
-      // local copy : ReduceOrCopyMulti takes an int as number of elements,
-      // so we split it in blocks of 1G elements.
-      int blockSize = 1<<30;
-      for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) {
-        size_t remaining = args->p2p.sendCount - offset;
-        if (remaining < blockSize) blockSize = remaining;
-        ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize);
-        sendbuff += blockSize; recvbuff += blockSize;
+            const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
+            const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
+
+            int nThreadsSplit = nThreads/2;
+            if ((tid < nThreadsSplit) && recvCount >= 0) {
+              int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
+              int nt = nThreadsSplit;
+              ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
+                prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
+
+              if (recvCount == 0) {
+                prims.recv(recvbuff, 0);
+              } else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
+                int realChunkSize = min(chunkSize, recvCount-offset);
+                ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
+                int nelem = min(realChunkSize, recvCount-offset);
+                prims.directRecv(recvbuff+offset, offset, nelem);
+              }
+            }
+            if ((tid >= nThreadsSplit) && sendCount >= 0) {
+              int peer = (comm->rank+delta)%comm->nRanks;
+              int nt = nThreads-nThreadsSplit;
+              ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
+                prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
+
+              if (sendCount == 0) {
+                prims.send(sendbuff, 0);
+              } else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
+                int realChunkSize = min(chunkSize, sendCount-offset);
+                ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
+                int nelem = min(realChunkSize, sendCount-offset);
+                prims.directSend(sendbuff+offset, offset, nelem);
+              }
+            }
+          }
+        }
+        tid -= nThreadsSegment;
+        if (tid < 0) return;
+        args++;
       }
     }
-    return;
-  }
-
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR;
-
-  int nthreadsSplit = nthreads/2;
-  // We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and
-  // receive threads, but then we define all peers to -1 since sender threads don't receive and
-  // receive threads don't send.
-  int peerNone[2] = {-1,-1};
-
-  if (tid < nthreadsSplit + WARP_SIZE ) {
-    const ssize_t sendSize = args->p2p.sendCount;
-    if (sendSize < 0) return;
-
-    int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks;
-    ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC>
-      prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*4, channel, comm);
-
-    if (sendSize == 0) {
-      prims.send(sendbuff, 0);
-    } else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) {
-      int realChunkSize = min(stepSize, sendSize-offset);
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      int nelem = min(realChunkSize, sendSize-offset);
-      prims.directSend(sendbuff+offset, offset, nelem);
-    }
-  } else {
-    const ssize_t recvSize = args->p2p.recvCount;
-    if (recvSize < 0) return;
-
-    int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks;
-    ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC>
-      prims(tid-nthreadsSplit-WARP_SIZE, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*4, channel, comm);
-
-    if (recvSize == 0) {
-      prims.recv(recvbuff, 0);
-    } else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) {
-      int realChunkSize = min(stepSize, recvSize-offset);
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      int nelem = min(realChunkSize, recvSize-offset);
-      prims.directRecv(recvbuff+offset, offset, nelem);
-    }
-  }
-}
+};
diff --git a/src/collectives/reduce.cc b/src/collectives/reduce.cc
index 67f2fae969..86388dfed1 100644
--- a/src/collectives/reduce.cc
+++ b/src/collectives/reduce.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  struct ncclInfo info = { ncclCollReduce, "Reduce",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncReduce, "Reduce",
     sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
     REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
   return ncclEnqueueCheck(&info);
diff --git a/src/collectives/reduce_scatter.cc b/src/collectives/reduce_scatter.cc
index 5ad7f5fa13..57c67bf431 100644
--- a/src/collectives/reduce_scatter.cc
+++ b/src/collectives/reduce_scatter.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
     sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
     REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
   return ncclEnqueueCheck(&info);
diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc
index 2e3287597a..65222a59f7 100644
--- a/src/collectives/sendrecv.cc
+++ b/src/collectives/sendrecv.cc
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream) {
-  struct ncclInfo info = { ncclCollSendRecv, "Send",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncSendRecv, "Send",
     sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
     1, 1 };
   ncclResult_t ret;
@@ -26,7 +27,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream) {
-  struct ncclInfo info = { ncclCollSendRecv, "Recv",
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct ncclInfo info = { ncclFuncSendRecv, "Recv",
     NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
     1, 1 };
   ncclResult_t ret;
diff --git a/src/debug.cc b/src/debug.cc
index 3b9920191e..25bf37a438 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -127,7 +127,7 @@ void ncclDebugInit() {
 void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
   if (ncclDebugLevel == -1) ncclDebugInit();
   if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
-  if (ncclDebugLevel < level) return;
+  if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
 
   // Gather the rank information. This can take > 1us so we want to make sure
   // we only do it when needed.
@@ -144,11 +144,11 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   if (level == NCCL_LOG_WARN)
     len = snprintf(buffer, sizeof(buffer),
         "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
-  else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
+  else if (level == NCCL_LOG_INFO)
     len = snprintf(buffer, sizeof(buffer),
         "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
 #ifdef ENABLE_TRACE
-  else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
+  else if (level == NCCL_LOG_TRACE) {
     auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
     double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
     len = snprintf(buffer, sizeof(buffer),
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 40e8f57107..a427bd982e 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -9,58 +9,58 @@
 #include "coll_net.h"
 
 // Only generate inline kernels for LL
-#define NCCL_FUNC5(coll, op, dtype) \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
+#define NCCL_FUNC5(func, algo, redop, dtype) \
+  (void*)NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
+  (void*)NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
+  (void*)NCCL_KERN_NAME(func, algo, LL, redop, dtype)
 
-#define NCCL_FUNC4(coll, op, dtype) \
-  (void*)NCCL_FUNC5(coll##Tree, op, dtype), \
-  (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
-  (void*)NCCL_FUNC5(coll##CollNet, op, dtype)
+#define NCCL_FUNC4(func, redop, type) \
+  (void*)NCCL_FUNC5(func, TREE,    redop, type), \
+  (void*)NCCL_FUNC5(func, RING,    redop, type), \
+  (void*)NCCL_FUNC5(func, COLLNET, redop, type)
 
 // Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  u8), \
-  (void*)NCCL_FUNC4(coll, op, i32), \
-  (void*)NCCL_FUNC4(coll, op, u32), \
-  (void*)NCCL_FUNC4(coll, op, i64), \
-  (void*)NCCL_FUNC4(coll, op, u64), \
-  (void*)NCCL_FUNC4(coll, op, f16), \
-  (void*)NCCL_FUNC4(coll, op, f32), \
-  (void*)NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8)
+#define NCCL_FUNCS3A(func, redop) \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, uint8_t), \
+  (void*)NCCL_FUNC4(func, redop, int32_t), \
+  (void*)NCCL_FUNC4(func, redop, uint32_t), \
+  (void*)NCCL_FUNC4(func, redop, int64_t), \
+  (void*)NCCL_FUNC4(func, redop, uint64_t), \
+  (void*)NCCL_FUNC4(func, redop, half), \
+  (void*)NCCL_FUNC4(func, redop, float), \
+  (void*)NCCL_FUNC4(func, redop, double)
+#define NCCL_FUNCS3B(func, redop) \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t), \
+  (void*)NCCL_FUNC4(func, redop, int8_t)
 
 // Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum), \
-  NCCL_FUNCS3A(coll, sum), \
-  NCCL_FUNCS3A(coll, sum), \
-  NCCL_FUNCS3A(coll, sum)
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
+#define NCCL_FUNCS2A(func) \
+  NCCL_FUNCS3A(func, Sum), \
+  NCCL_FUNCS3A(func, Sum), \
+  NCCL_FUNCS3A(func, Sum), \
+  NCCL_FUNCS3A(func, Sum)
+#define NCCL_FUNCS2B(func) \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum), \
+  NCCL_FUNCS3B(func, Sum)
 
 // Must be consistent with the ncclFuncSet enum
 static void* const ncclKerns[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
-  (void*)NCCL_KERN_NAME(ncclSendRecv, copy, i8),
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
+  (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
+  NCCL_FUNCS2B(Broadcast),
+  NCCL_FUNCS2A(Reduce),
+  NCCL_FUNCS2B(AllGather),
+  NCCL_FUNCS2A(ReduceScatter),
+  NCCL_FUNCS2A(AllReduce)
 };
 
 /*****************************************************************************/
@@ -87,41 +87,57 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
   return ncclSuccess;
 }
 
-ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
+static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) {
+  if (channel->workCount == NCCL_MAX_OPS) {
+    WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
+    return ncclInvalidUsage;
+  }
+  int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
+  struct ncclWork* w = channel->workFifo+opIndex;
+  struct ncclWorkElem* e = w->elems;
+  volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
+  while (activePtr[0] != 0) sched_yield();
+  memset(w, 0, sizeof(struct ncclWork));
+  // Initialize with work elem if provided
+  if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
+  e->active = 1;
+  e->index = opIndex;
+  channel->workFifoTail++;
+  channel->workCount++;
+  if (work) *work = w;
+  return ncclSuccess;
+}
+
+static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
   // Only launch blocks where we have work to do.
   for (int c=0; c<comm->p2pnChannels; c++) {
-    if (comm->channels[c].collCount) params->gridDim.x = c+1;
+    if (comm->channels[c].workCount) params->gridDim.x = c+1;
   }
 
   // Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
   for (int c=0; c<params->gridDim.x; c++) {
     struct ncclChannel* channel = comm->channels+c;
-    if (channel->collCount == 0) {
-      int opIndex = channel->collFifoTail;
-      struct ncclColl* c = channel->collectives+opIndex;
-      volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-      while (activePtr[0] != 0) sched_yield();
-
-      c->args.p2p.delta = -1; // no-op
-      c->funcIndex = FUNC_INDEX_P2P;
-      c->args.comm = comm->devComm;
-      c->active = 1;
-      opIndex = (opIndex+1)%NCCL_MAX_OPS;
-      c->nextIndex = opIndex;
-      channel->collFifoTail = opIndex;
-      channel->collCount++;
+    if (channel->workCount == 0) {
+      struct ncclWork* w;
+      NCCLCHECK(getNextOp(channel, &w, NULL));
+      struct ncclWorkElem* e = w->elems;
+      e->comm = comm->devComm;
+      e->funcIndex = FUNC_INDEX_P2P;
+      e->p2p.nThreads = 0;
     }
-    channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2;
+    channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active = 2;
   }
 
   // Find the first operation, choose the kernel accordingly and pass it
   // as the first argument.
-  struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
-  memcpy(&comm->args, coll, sizeof(struct ncclColl));
-  // As we pass that coll directly, we can free it immediately.
-  coll->active = 0;
+  struct ncclChannel* c0 = comm->channels;
+  struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
+  struct ncclWorkElem* elem = work->elems;
+  memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
+  // As we inline the first coll directly, we can free it immediately.
+  if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
 
-  params->func = ncclKerns[coll->funcIndex];
+  params->func = ncclKerns[elem->funcIndex];
   return ncclSuccess;
 }
 
@@ -131,7 +147,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
   bool done = false;
   while (done == false) {
     if (val >= comm->intraRanks) {
-      WARN("Trying to launch too many collectives");
+      WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
       return ncclInvalidUsage;
     }
     if (val+1 == comm->intraRanks) {
@@ -151,7 +167,7 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
   volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
   int val = *ptr;
   if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
-    WARN("Trying to launch too many collectives");
+    WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
     return ncclInternalError;
   }
   return ncclSuccess;
@@ -222,13 +238,18 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
   // launch and the ncclProxyStart call could cause a deadlock.
   // Also, starting the proxies after the CUDA launch seems to be better for
   // performance (latency).
+  uint64_t max = 0ULL;
   for (int r=0; r<params->gridDim.x; r++) {
     struct ncclChannel* channel = comm->channels+r;
-    channel->collStart = channel->collFifoTail;
-    channel->collCount = 0;
+    max = std::max(max, channel->workFifoTail);
+    channel->workCount = 0;
+  }
+  for (int r=0; r<comm->p2pnChannels; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    channel->workFifoTail = max;
   }
   params->gridDim.x = params->blockDim.x = 0;
-  comm->lastOpCount = comm->opCount;
+  comm->lastOpCount = max;
   NCCLCHECK(ncclProxyStart(comm));
   return ncclSuccess;
 }
@@ -280,7 +301,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
   //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
   TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
 
-  int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
+  int nc = (info->nChannels > 0) ? info->nChannels :
+           (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
   int nt = comm->maxThreads[info->algorithm][info->protocol];
   int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
   while (info->nBytes < nc*nt*threadThreshold) {
@@ -289,6 +311,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
     else break;
   }
   if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
+  if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
   info->nChannels = nc;
   info->nThreads = nt;
   return ncclSuccess;
@@ -296,14 +319,14 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
 
 static ncclResult_t getPatternInfo(struct ncclInfo* info) {
   switch (info->coll) {
-    case ncclCollBroadcast:
+    case ncclFuncBroadcast:
       info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
-    case ncclCollReduce:
+    case ncclFuncReduce:
       info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
-    case ncclCollReduceScatter:
-    case ncclCollAllGather:
+    case ncclFuncReduceScatter:
+    case ncclFuncAllGather:
       info->pattern = ncclPatternRing; break;
-    case ncclCollAllReduce:
+    case ncclFuncAllReduce:
       info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
     default:
       WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
@@ -333,30 +356,22 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
   return ncclSuccess;
 }
 
-static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
-  coll->args.sendbuff = info->sendbuff;
-  coll->args.recvbuff = info->recvbuff;
-  coll->args.comm = info->comm->devComm;
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
+  work->comm = info->comm->devComm;
 
-  if (info->coll == ncclCollSendRecv) {
-    coll->args.p2p.sendCount = info->sendbytes;
-    coll->args.p2p.recvCount = info->recvbytes;
-    coll->args.p2p.delta = info->delta;
-    coll->funcIndex = FUNC_INDEX_P2P;
-    coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
-    return ncclSuccess;
-  }
   // Set nstepsPerLoop and nchunksPerLoop
   NCCLCHECK(getAlgoInfo(info));
   NCCLCHECK(getPatternInfo(info));
   NCCLCHECK(getLoopInfo(info));
 
-  coll->args.coll.root = info->root;
-  coll->args.coll.count = info->count;
-  coll->args.coll.nChannels = info->nChannels;
-  coll->args.coll.nThreads = info->nThreads;
+  work->sendbuff = info->sendbuff;
+  work->recvbuff = info->recvbuff;
+  work->coll.root = info->root;
+  work->coll.count = info->count;
+  work->coll.nChannels = info->nChannels;
+  work->nThreads = info->nThreads;
 
-  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
+  work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
 
   int stepSize   = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
   int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@@ -367,25 +382,25 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
   if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
     if (info->pattern == ncclPatternTreeUpDown) {
       // Optimize chunkSize / nSteps
-      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
-      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
-      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
     }
     // Use lastChunkSize as chunkSize
-    coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+    work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
     // Optimize chunkSize / nSteps
-    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
-    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
-    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
     // Use lastChunkSize as chunkSize
-    coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+    work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->protocol == NCCL_PROTO_LL) {
     const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
     const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
-    coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
-    ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
-    coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
+    work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+    ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
+    work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
     int nNodes = info->comm->nNodes;
     float ppn = info->comm->nRanks / (float)nNodes;
@@ -393,7 +408,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
     while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
     while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
     // Use lastChunkSize as chunkSize
-    coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
+    work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
   }
 
   // Compute nSteps for proxies
@@ -406,9 +421,13 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
   proxyArgs->sliceSteps = sliceSteps;
   proxyArgs->chunkSteps = chunkSteps;
   proxyArgs->protocol = info->protocol;
-  proxyArgs->opCount = info->comm->opCount;
   proxyArgs->dtype = info->datatype;
   proxyArgs->redOp = info->op;
+  // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
+  // because some protocols need to transmit more than the total size, plus they sometimes
+  // round up
+  proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
+
   TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
       proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
       nLoops, proxyArgs->nsteps, info->comm);
@@ -427,32 +446,26 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
 }
 
 ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
-  if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
+  if (info->comm->nRanks == 1) {
     if (info->sendbuff != info->recvbuff)
       CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
     return ncclSuccess;
   }
 
-  struct ncclColl coll;
+  struct ncclWorkElem work;
   struct ncclProxyArgs proxyArgs;
   memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
-  NCCLCHECK(computeColl(info, &coll, &proxyArgs));
+  NCCLCHECK(computeColl(info, &work, &proxyArgs));
 
   info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
 
-  int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
+  int nChannels = work.coll.nChannels;
   int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
 
   for (int bid=0; bid<nChannels*nSubChannels; bid++) {
-    int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
-      info->comm->myParams->gridDim.x % info->comm->nChannels;
+    int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
     struct ncclChannel* channel = info->comm->channels+channelId;
 
-    if (channel->collCount == NCCL_MAX_OPS) {
-      WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
-      return ncclInvalidUsage;
-    }
-
     // Proxy
     proxyArgs.channel = channel;
     // Adjust pattern for CollNet based on channel index
@@ -460,67 +473,141 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
       info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
     }
 
-    if (info->coll == ncclCollSendRecv) {
-      info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
-      NCCLCHECK(ncclProxySaveP2p(info, channel));
-    } else {
-      NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
-    }
+    if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+
     info->comm->myParams->gridDim.x++;
-    int opIndex = channel->collFifoTail;
-    struct ncclColl* c = channel->collectives+opIndex;
-    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-    while (activePtr[0] != 0) sched_yield();
-
-    memcpy(c, &coll, sizeof(struct ncclColl));
-    if (info->coll != ncclCollSendRecv) c->args.coll.bid = bid % coll.args.coll.nChannels;
-
-    c->active = 1;
-    opIndex = (opIndex+1)%NCCL_MAX_OPS;
-    c->nextIndex = opIndex;
-    channel->collFifoTail = opIndex;
-    channel->collCount++;
+    work.coll.bid = bid % nChannels;
+    NCCLCHECK(getNextOp(channel, NULL, &work));
   }
-  info->comm->opCount++;
   return ncclSuccess;
 }
 
-// Save p2p operations in comm->p2plist. Operations will be posted to channels
+#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
+#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
+
+ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
+  if (comm->asyncOpCount == 0) {
+    return ncclSuccess;
+  } else if (comm->asyncOpCount == 1) {
+    // No aggregation
+    struct ncclInfo* info = comm->asyncOps;
+    info->nChannels = 0;
+    NCCLCHECK(ncclSaveKernel(info));
+  } else {
+    // Aggregation
+    size_t channelSize = NCCL_AGG_CHANNEL_SIZE * comm->nRanks;  // scale channel size based on nranks as latency increases
+    // Reduce the per-channel size if we cannot fully utilize the channels
+    while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
+    for (int c = 0; c < comm->asyncOpCount; c++) {
+      struct ncclInfo* info = comm->asyncOps+c;
+      info->nChannels = std::min((int)DIVUP(info->nBytes, channelSize), comm->nChannels); // assign number of channels
+      NCCLCHECK(ncclSaveKernel(info));
+    }
+  }
+  // Reset counters
+  comm->asyncOpCount = 0;
+  comm->asyncTotalSize = 0;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
+  ncclComm_t comm = info->comm;
+  if (comm->asyncOpCount >= NCCL_MAX_OPS) {
+    WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS);
+    return ncclInvalidUsage;
+  }
+  memcpy(comm->asyncOps+comm->asyncOpCount, info, sizeof(struct ncclInfo));
+  comm->asyncOpCount++;
+  comm->asyncTotalSize += info->nBytes;
+  return ncclSuccess;
+}
+
+// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels
 // during ncclGroupEnd()
-ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
+static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
   struct ncclComm* comm = info->comm;
-  struct ncclP2Plist* p2plist = &comm->p2plist;
   int peer = info->root;
-  p2plist->count++;
   ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
-  if (info->recvbuff == NULL) {
+  if (info->opName[0] == 'S') { // Send
     if (peer != comm->rank) {
       int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
         if (comm->channels[channelId].peers[peer].send.connected == 0) {
-          p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
+          comm->connectSend[peer] |= (1<<channelId);
+          comm->connect = 1;
         }
       }
     }
-    p2plist->peerlist[info->root].sendbytes = nBytes;
-    p2plist->peerlist[info->root].sendbuff = info->sendbuff;
+    NCCLCHECK(enqueueP2pInfo(comm->p2pSends+info->root, (void*)info->sendbuff, nBytes));
+    comm->p2pSendCount++;
   } else {
     if (peer != comm->rank) {
       int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
         if (comm->channels[channelId].peers[peer].recv.connected == 0) {
-          p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
+          comm->connectRecv[peer] |= (1<<channelId);
+          comm->connect = 1;
         }
       }
     }
-    p2plist->peerlist[info->root].recvbytes = nBytes;
-    p2plist->peerlist[info->root].recvbuff = info->recvbuff;
+    NCCLCHECK(enqueueP2pInfo(comm->p2pRecvs+info->root, info->recvbuff, nBytes));
+    comm->p2pRecvCount++;
   }
   return ncclSuccess;
 }
 
+static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
+  for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != info->delta; s++) {
+    if (work->elems[s].p2p.nThreads == 0) return s;
+  }
+  return -1;
+}
+
+static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
+  struct ncclWorkElem* elem = work->elems+s;
+  elem->comm = info->comm->devComm;
+  elem->funcIndex = FUNC_INDEX_P2P;
+  elem->nThreads = info->nThreads = NCCL_MAX_NTHREADS;
+  elem->sendbuff = info->sendbuff;
+  elem->recvbuff = info->recvbuff;
+  elem->p2p.sendCount = info->sendbytes;
+  elem->p2p.recvCount = info->recvbytes;
+  elem->p2p.delta = info->delta;
+  const int nsegments = s+1;
+  int nThreads = 512;
+  while (nsegments*nThreads > 512) nThreads /= 2;
+  if (nThreads >= 128) nThreads += WARP_SIZE;
+  for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
+  int channelId = info->channelId;
+  struct ncclChannel* channel = info->comm->channels+channelId;
+
+  // Try to reuse last p2p operation if not full yet
+  int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
+  struct ncclWork* w = channel->workFifo+opIndex;
+  int segment = -1;
+  if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0) {
+    // Try to pack more segments into a single operation
+    segment = getSegment(info, w);
+  }
+  if (segment == -1) {
+    NCCLCHECK(getNextOp(channel, &w, NULL));
+    segment = 0;
+  }
+
+  NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
+  NCCLCHECK(saveP2pOp(info, w, segment));
+  info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
+  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
+
+  return ncclSuccess;
+}
+
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
   // Launch asynchronously if needed
   if (ncclAsyncMode()) {
@@ -542,10 +629,10 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
         info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
         info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
 
-    if (info->coll == ncclCollSendRecv) { //p2p stored separately
+    if (info->coll == ncclFuncSendRecv) { //p2p stored separately
       NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
     } else {
-      NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
+      NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
     }
 end:
     if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index dd9f9f0bde..a0f1265c84 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -23,14 +23,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
   for (int c=0; c<nChannels; c++) {
     struct ncclChannel* channel = comm->channels+c;
     channel->ring.prev = channel->ring.next = -1;
-    channel->treeUp.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
-    channel->treeDn.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
-    channel->collTreeUp.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
-    channel->collTreeDn.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
+    channel->tree.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
+    channel->collTree.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
 
     int* ringIntra = ringGraph->intra+c*localRanks;
     int* treeIntra = treeGraph->intra+c*localRanks;
@@ -44,33 +40,21 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
         channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
       }
       if (treeIntra[i] == rank) {
-        int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
-        int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
+        int parentIndex = 0;
+        int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
+        int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
 
-        // Tree loop always flows in the same direction. Other trees are symmetric, i.e.
-        // up/down go in reverse directions
-        int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
-
-        // Down tree is common
-        topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
-        topoRanks->treeDnSend[c] = treeIntra[sendIndex];
-        channel->treeDn.up       = treeIntra[prev];
-        channel->treeDn.down[0]  = treeIntra[next];
-        // Up tree depends on the pattern
-        topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
-        topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
-        channel->treeUp.down[0]  = sym ? channel->treeDn.down[0]  : channel->treeDn.up ;
-        channel->treeUp.up       = sym ? channel->treeDn.up       : channel->treeDn.down[0];
+        topoRanks->treeToParent[c] = treeIntra[parentIndex];
+        topoRanks->treeToChild0[c] = treeIntra[child0Index];
+        topoRanks->treeToChild1[c] = treeIntra[child1Index];
+        channel->tree.up         = i == 0 ? -1 : treeIntra[i-1];
+        channel->tree.down[0]    = i == localRanks-1 ? -1 : treeIntra[i+1];
       }
       if (collNetIntra[i] == rank) {
         int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
 
-        // CollTrees are always symmetric, i.e.
-        // up/down go in reverse directions
-        channel->collTreeDn.up      = collNetIntra[prev];
-        channel->collTreeDn.down[0] = collNetIntra[next];
-        channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
-        channel->collTreeUp.up      = channel->collTreeDn.up;
+        channel->collTree.up      = collNetIntra[prev];
+        channel->collTree.down[0] = collNetIntra[next];
       }
     }
     topoRanks->ringPrev[c] = channel->ring.prev;
@@ -120,72 +104,66 @@ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstR
  return ncclSuccess;
 }
 
-static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
-  if (u0 != -1) tree0->up = indexes[u0];
-  if (u1 != -1) tree1->up = indexes[u1];
+static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
+  if (u == -1) return ncclSuccess;
+  tree->up = indexes[u];
   return ncclSuccess;
 }
 
-static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
+static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
+  if (d == -1) return ncclSuccess;
   int x = 0;
-  if (down[x] >= 0) x++;
-  if (down[x] >= 0) {
-    WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
+  while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
+  if (x == NCCL_MAX_TREE_ARITY) {
+    WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
     return ncclInternalError;
   }
-  if (r0 != -1) down[x++] = indexes[r0];
-  if (r1 != -1) down[x++] = indexes[r1];
+  tree->down[x] = indexes[d];
   return ncclSuccess;
 }
 
-static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
-  NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1));
-  NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1));
-  return ncclSuccess;
-}
-
-static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
-  if (tree->down[0] == upRank) tree->down[0] = -1;
-  if (rank == upRank) tree->up = -1;
-  return ncclSuccess;
-}
-
-static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
+static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
   const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
-  int* indexesSend, *indexesRecv;
-  NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
-  NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
+  int* ranksToParent, *ranksToChild0, *ranksToChild1;
+  NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
+  NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
+  NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
 
   // Compute tree depth. Not an exact value but a good approximation in most
   // cases
   int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
 
-  int u0, d0_0, d0_1, u1, d1_0, d1_1;
-  NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
+  int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
+  NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
   for (int c=0; c<nChannels; c++) {
      struct ncclChannel* channel0 = comm->channels+c;
      struct ncclChannel* channel1 = channel0+nChannels;
-     NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
-     NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
-     NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
-     NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
-     int root = indexesSend[node];
-     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
-     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
-     NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
-     NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
-     NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
-     NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
-     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
-     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
-     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c,           channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
-     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
-     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c,           channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
-     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
-     channel0->treeUp.depth = channel1->treeUp.depth = depth;
+     NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
+     NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
+     NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
+     if (comm->rank == ranksToParent[node]) {
+       NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
+       NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
+     }
+     if (comm->rank == ranksToChild0[node]) {
+       NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
+       NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
+     }
+     if (comm->rank == ranksToChild1[node]) {
+       NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
+       NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
+     }
+     if (comm->rank == ranksToParent[node] ||
+         comm->rank == ranksToChild0[node] ||
+         comm->rank == ranksToChild1[node]) {
+       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c,           channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
+       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
+     }
+     channel0->tree.depth = channel1->tree.depth = depth;
   }
-  free(indexesSend);
-  free(indexesRecv);
+  free(ranksToParent);
+  free(ranksToChild0);
+  free(ranksToChild1);
   return ncclSuccess;
 }
 
@@ -198,13 +176,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
     struct ncclChannel* channel = comm->channels+c;
     // Set root of collTree to id nranks
     if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
-      channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+      channel->collTree.up = nranks;
     }
     if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
-      channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+      channel->collTree.down[0] = -1;
     }
-    channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
-    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
+    channel->collTree.depth = depth;
+    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
   }
   int recvIndex = 0;  // recv GPU index is always 0
   int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
@@ -212,13 +190,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
     struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
     // Set root of collTree to id nranks
     if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
-      channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+      channel->collTree.up = nranks;
     }
     if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
-      channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+      channel->collTree.down[0] = -1;
     }
-    channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
-    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
+    channel->collTree.depth = depth;
+    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
   }
   return ncclSuccess;
 }
@@ -253,35 +231,33 @@ int ncclMaxNchannels() {
   return maxNchannels;
 }
 
-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings) {
   // Gather data from all ranks
-  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
+  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
   int nranks = comm->nRanks;
   int nChannels = comm->nChannels;
   NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
   for (int i=0; i<nranks; i++) {
     for (int c=0; c<nChannels;c++) {
       ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
       ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
       ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
       ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
-      treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
-      treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
-      treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
-      treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
+      treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
+      treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
+      treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
     }
   }
 
   // Connect rings and trees. This should also duplicate the channels.
   NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
-  NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
+  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
 
   // Duplicate ringPrev/ringNext for ncclBuildRing
   memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -308,10 +284,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
   free(ringSend);
   free(ringPrev);
   free(ringNext);
-  free(treeUpRecv);
-  free(treeUpSend);
-  free(treeDnRecv);
-  free(treeDnSend);
+  free(treeToParent);
+  free(treeToChild0);
+  free(treeToChild1);
 
   return ncclSuccess;
 }
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index b7118747ce..c7525e60db 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -60,7 +60,12 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
         struct ncclTopoLinkList* remPath;
         NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
         float width = std::min(path->width, link->width);
-        if (remPath->width < width) {
+
+        // allow routing through a GPU only as 1 hop
+        if (node != baseNode && node->type == GPU &&
+            (link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
+
+        if ((remPath->width == 0 || remPath->count > path->count) && remPath->width < width) {
           // Find reverse link
           for (int l=0; l<remNode->nlinks; l++) {
             if (remNode->links[l].remNode == node) {
@@ -80,24 +85,20 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
 
           // Start with path type = link type. PATH and LINK types are supposed to match.
           // Don't consider LINK_NET as we only care about the NIC->GPU path.
-          int type = link->type == LINK_NET ? 0 : link->type;
+          int type = link->type == LINK_NET ? LINK_LOC : link->type;
           // Differentiate between one and multiple PCI switches
-          if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
+          if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
           // Consider a path going through the CPU as PATH_PHB
           if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
-          // Ignore Power CPU in an NVLink path
-          if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
-              link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
+          // Set 1 hop NVLink as NVB
+          if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
 
           remPath->type = std::max(path->type, type);
 
           // Add to the list for the next iteration if not already in the list
-          // Disallow GPUs as intermediate steps for now
-          if (remNode->type != GPU) {
-            int i;
-            for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
-            if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
-          }
+          int i;
+          for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
+          if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
         }
       }
     }
@@ -217,7 +218,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
     if (l == -1) {
       char* str = getenv(levelEnv);
       if (str) {
-        for (int i=0; i<PATH_NET; i++) {
+        for (int i=0; i<=PATH_SYS; i++) {
           if (strcmp(str, topoPathTypeStr[i]) == 0) {
             l = i;
             break;
@@ -239,9 +240,10 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
 }
 
 int ncclTopoUserP2pLevel = -1;
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
   *p2p = 0;
-  *read = 0;
+  if (read) *read = 0;
+  if (intermediateRank) *intermediateRank = -1;
 
   // Get GPUs from topology
   int g1, g2;
@@ -251,7 +253,16 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
     // GPU not found, we can't use p2p.
     return ncclSuccess;
   }
+
+
+  // Set intermediate GPU rank, if routing through an intermediate GPU.
   struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
+  if (path->count == 2) {
+    struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
+    if (intermediateNode->type == GPU && intermediateRank) {
+      *intermediateRank = intermediateNode->gpu.rank;
+    }
+  }
 
   // In general, use P2P whenever we can.
   int p2pLevel = PATH_SYS;
@@ -280,7 +291,7 @@ compare:
   if (path->type == PATH_NVL) {
     struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
     // Enable P2P Read for Ampere/NVLink only
-    if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
+    if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
   }
 
   return ncclSuccess;
@@ -355,8 +366,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
 
     // Update path when we don't want to / can't use GPU Direct P2P
     for (int p=0; p<system->nodes[GPU].count; p++) {
-      int p2p, read;
-      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
+      int p2p;
+      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
       if (p2p == 0) {
         // Divert all traffic through the CPU
         int cpu;
@@ -464,8 +475,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
     // Local rank
     path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
     if (path->type == PATH_NVL) {
-      int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
-      double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+      float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
       *nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
     } else {
       *nChannels = 2;
diff --git a/src/graph/rings.cc b/src/graph/rings.cc
index 5aacbb5913..53130d1290 100644
--- a/src/graph/rings.cc
+++ b/src/graph/rings.cc
@@ -21,7 +21,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
 
 ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
   for (int r=0; r<nrings; r++) {
-    char prefix[30];
+    char prefix[40];
     /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
     dumpLine(prev+r*nranks, nranks, prefix);
     sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
diff --git a/src/graph/search.cc b/src/graph/search.cc
index cb5292114b..57c66e7664 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -22,8 +22,18 @@ static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu
   }
   return maxWidth;
 }
+static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
+  float nvlinkWidth = 0.0, pciWidth = 0.0;
+  for (int l=0; l<gpu->nlinks; l++) {
+    struct ncclTopoLink* link = gpu->links+l;
+    if (link->type == LINK_NVL) nvlinkWidth += link->width;
+    if (link->type == LINK_PCI) pciWidth = link->width;
+  }
+  return std::max(pciWidth, nvlinkWidth);
+}
 ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
   system->maxWidth = 0.0;
+  system->totalWidth = 0.0;
   int inter = system->nodes[NET].count;
   if (inter == 0 && system->nodes[GPU].count == 1) {
     system->maxWidth = LOC_WIDTH;
@@ -32,6 +42,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
   for (int g=0; g<system->nodes[GPU].count; g++) {
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
     system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
+    system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu));
   }
   return ncclSuccess;
 }
@@ -290,7 +301,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoG
     return ncclSuccess;
   }
   // 3. Less hops (but not at the price of going cross NICs)
-  if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
+  if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
   return ncclSuccess;
 }
 
@@ -326,11 +337,26 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
         struct ncclTopoNode* net = system->nodes[NET].nodes+n;
         if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
         if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
+
+        // Balanced Tree : count half of the bandwidth on first two GPUs
+        int nextBackToNet = -1;
+        float speedInterSave = graph->speedInter;
+        if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
+          // Count half of the bandwidth on each of the first two GPUs
+          if (step == 0) nextBackToNet = 1;
+          else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
+          graph->speedInter /= 2;
+        }
+
         NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
+        graph->speedInter = speedInterSave;
         if (net) {
           graph->inter[graph->nChannels*2+1] = net->id;
-          NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
+          NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
+
+          if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2;
           NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
+          graph->speedInter = speedInterSave;
         }
       }
     }
@@ -460,13 +486,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
 ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
   if (system->nodes[NET].count) {
     if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
-    else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
-    else *backToNet = 1;
-    if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
-    else *backToFirstRank = -1;
+    else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1;
+    else *backToNet = 0;
+    *backToFirstRank = -1;
   } else {
     *backToNet = -1;
-    if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
+    if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1;
     else *backToFirstRank = -1;
   }
   return ncclSuccess;
@@ -503,7 +528,7 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
 /* User defined graph from XML file */
 /************************************/
 
-struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
+struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
 ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
   int ngpus = system->nodes[GPU].count;
   int* inter = graph->inter+2*c;
@@ -623,7 +648,7 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
   return ncclSuccess;
 }
 
-float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArray[] = { 42.0, 30.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDS (sizeof(speedArray)/sizeof(float))
 
 ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
@@ -651,11 +676,15 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
 
   if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
 
+  // SPLIT_TREE works better on older archs.
+  int ccMin;
+  NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
+  if (ccMin < 80 && graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+
   struct ncclTopoGraph tmpGraph;
   memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
 
   // First try crossnic, then decrease speed and finally increase speedIntra.
-  tmpGraph.pattern = graph->pattern;
   int pass = 1;
   int speedIndex = 0;
   while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
@@ -670,7 +699,7 @@ search:
 
   NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
 #if 0
-  printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+  printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
   for (int c=0; c<graph->nChannels; c++) {
     printf("%2d : ", c);
     for (int g=0; g<ngpus; g++) {
@@ -680,7 +709,8 @@ search:
   }
 #endif
   // Optimal solution, stop here
-  if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
+  if (time == -1) goto done;
+  if (graph->nChannels*graph->speedInter >= system->totalWidth) goto done;
 
   if (pass == 1) {
     // First pass, we don't have a solution yet ; try other options
@@ -694,7 +724,7 @@ search:
 
     if (time != -1) globalTimeout += time;
     else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
-    if (globalTimeout < 0) goto done;
+    if (globalTimeout < 0 && graph->nChannels) goto done;
 
     int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
     if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
@@ -709,10 +739,6 @@ search:
     tmpGraph.typeInter = PATH_PIX;
 
     // Try a simpler tree
-    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
-      tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
-      goto search;
-    }
     if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
       tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
       goto search;
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index ed4bd2322f..3e395c52d1 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -20,8 +20,8 @@
 #define BUSID_REDUCED_SIZE (sizeof("0000:00"))
 
 const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "",    "",    "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "",    "PCI", "",    "",    "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
 
 /******************************************************************/
 /******************* Graph Creation Functions *********************/
@@ -215,7 +215,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
 }
 
 ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
-  INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
+  INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
   char line[1024];
   for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
   INFO(NCCL_GRAPH, "==========================================");
@@ -441,7 +441,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
       }
     }
     if (remote) {
-      int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+      float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
       NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
       if (remote->type != GPU) {
         NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
@@ -521,6 +521,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
       struct ncclXmlNode* node;
       NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
       if (node == NULL) continue;
+      NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
       NCCLCHECK(xmlSetAttrInt(node, "rank", r));
       NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
     }
@@ -535,6 +536,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
       NCCLCHECK(collNetGetProperties(n, &props));
       struct ncclXmlNode* netNode;
       NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+      NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
       NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
       NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
       NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -552,6 +554,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     NCCLCHECK(ncclNetGetProperties(n, &props));
     struct ncclXmlNode* netNode;
     NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
     NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
     NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
     NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -560,6 +563,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
   }
 
+  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
+  NCCLCHECK(ncclTopoTrimXml(xml));
+
   xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
   if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
     INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
@@ -668,3 +674,21 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
   }
   return ncclSuccess;
 }
+
+ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
+  *count = system->nodes[NET].count;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
+  if (system->nodes[GPU].count == 0) return ncclInternalError;
+  int min, max;
+  min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
+  for (int g=1; g<system->nodes[GPU].count; g++) {
+    min = std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
+    max = std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
+  }
+  if (ccMin) *ccMin = min;
+  if (ccMax) *ccMax = max;
+  return ncclSuccess;
+}
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 950cff8fda..a12bb2d1d6 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -12,8 +12,10 @@
 #include <sched.h>
 
 #define LOC_WIDTH 5000.0
-#define PASCAL_NVLINK_WIDTH 18.0
-#define VOLTA_NVLINK_WIDTH 21.0
+#define SM60_NVLINK_WIDTH 18.0
+#define SM70_NVLINK_WIDTH 21.0
+#define SM80_NVLINK_WIDTH 21.0
+#define SM86_NVLINK_WIDTH 12.0
 #define PCI_WIDTH 12.0           // PCI Gen3 x16
 #define QPI_WIDTH 6.0
 #define SKL_QPI_WIDTH 9.0
@@ -38,20 +40,21 @@ extern const char* topoNodeTypeStr[];
 // We want link types and path types to match as much as possible
 #define LINK_LOC 0
 #define LINK_NVL 1
-#define LINK_PCI 2
-// Skipping 3 for PATH_PXB
-// Skipping 4 for PATH_PHB
-#define LINK_SYS 5
-#define LINK_NET 6
+// Skipping 2 for PATH_NVB
+#define LINK_PCI 3
+// Skipping 4 for PATH_PXB
+// Skipping 5 for PATH_PHB
+#define LINK_SYS 6
+#define LINK_NET 7
 extern const char* topoLinkTypeStr[];
 
 #define PATH_LOC 0
 #define PATH_NVL 1
-#define PATH_PIX 2
-#define PATH_PXB 3
-#define PATH_PHB 4
-#define PATH_SYS 5
-#define PATH_NET 6
+#define PATH_NVB 2
+#define PATH_PIX 3
+#define PATH_PXB 4
+#define PATH_PHB 5
+#define PATH_SYS 6
 extern const char* topoPathTypeStr[];
 
 struct ncclTopoNode;
@@ -117,6 +120,7 @@ struct ncclTopoNodeSet {
 struct ncclTopoSystem {
   struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
   float maxWidth;
+  float totalWidth;
 };
 
 ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
@@ -132,6 +136,8 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
 ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
 ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
 
+ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
+
 static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
   *index = -1;
   for (int i=0; i<system->nodes[type].count; i++) {
@@ -154,4 +160,13 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
   return ncclInternalError;
 }
 
+// Returns NVLink speed in GB/s
+static float ncclTopoNVLinkSpeed(int cudaCompCap) {
+  return
+    cudaCompCap == 86 ? SM86_NVLINK_WIDTH :
+    cudaCompCap >= 80 ? SM80_NVLINK_WIDTH :
+    cudaCompCap >= 70 ? SM70_NVLINK_WIDTH :
+    cudaCompCap >= 60 ? SM60_NVLINK_WIDTH :
+    SM80_NVLINK_WIDTH;
+}
 #endif
diff --git a/src/graph/trees.cc b/src/graph/trees.cc
index 722e61bce2..8e1e2ae852 100644
--- a/src/graph/trees.cc
+++ b/src/graph/trees.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -28,7 +28,7 @@
  *    / \     / \     /  \     \
  *   1   3   5   7   9   11    13
  */
-ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
   int up, down0, down1;
   int bit;
   for (bit=1; bit<nranks; bit<<=1) {
@@ -37,13 +37,16 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
 
   if (rank == 0) {
     *u = -1;
-    *d0 = nranks > 1 ? bit >> 1 : -1;
-    *d1 = -1;
+    *d0 = -1;
+    // Child rank is > 0 so it has to be our child 1, not 0.
+    *d1 = nranks > 1 ? bit >> 1 : -1;
     return ncclSuccess;
   }
 
   up = (rank ^ bit) | (bit << 1);
+  // if smaller than the parent, we are his first child, otherwise we're his second
   if (up >= nranks) up = (rank ^ bit);
+  *parentChildType = (rank < up) ? 0 : 1;
   *u = up;
 
   int lowbit = bit >> 1;
@@ -62,42 +65,42 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
 }
 
 /* Build a double binary tree. Take the previous tree for the first tree.
- * For the second tree, we use a mirror tree (if nranks is odd)
+ * For the second tree, we use a mirror tree (if nranks is even)
  *
- *                 8---------0---------5
- *          ______/ \______      _____/ \______
- *         4               12   1              9
- *       /   \            /      \           /   \
- *     2       6       10          3       7      10
- *    / \     / \     /  \        / \     / \    /  \
- *   1   3   5   7   9   11      2   4   6   8  11  12
+ * 0---------------8                   3----------------11
+ *          ______/ \                 / \______
+ *         4         \               /         7
+ *       /   \        \             /        /   \
+ *     2       6       10         1        5      9
+ *    / \     / \     /  \       / \      / \    / \
+ *   1   3   5   7   9   11     0   2    4   6  8   10
  *
- * or shift it by one rank (if nranks is even)
+ * or shift it by one rank (if nranks is odd).
  *
- *                 8---------0--------------9
- *          ______/ \                ______/ \
- *         4         \              5         \
- *       /   \        \           /   \        \
- *     2       6       10       3       7       11
- *    / \     / \     /  \     / \     / \     /  \
- *   1   3   5   7   9   11   2   4   6   8   10   1
+ * 0---------------8            1---------------9
+ *          ______/ \______              ______/ \______
+ *         4               12           5                0
+ *       /   \            /           /   \            /
+ *     2       6       10           3       7       11
+ *    / \     / \     /  \         / \     / \     /  \
+ *   1   3   5   7   9   11       2   4   6   8  10   12
  */
-ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
+ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
   // First tree ... use a btree
-  ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
+  ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
   // Second tree ... mirror or shift
-  if (nranks % 2 == 0) {
+  if (nranks % 2 == 1) {
     // shift
     int shiftrank = (rank-1+nranks) % nranks;
     int u, d0, d1;
-    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
+    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
     *s1 = u == -1 ? -1 : (u+1) % nranks;
     *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
     *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
   } else {
     // mirror
     int u, d0, d1;
-    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
+    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
     *s1 = u == -1 ? -1 : nranks-1-u;
     *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
     *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 62f50ef8b8..42a4dde887 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -62,77 +62,86 @@ static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4,
 // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
 static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
 { /* NVLINK */
-  { /* Tree (LL/LL128/Simple)*/ { .52, 1.2, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, /* CollNet (LL/LL128/Simple)*/ {  .5, 1.2, 4.0 } },
+  { /* Tree (LL/LL128/Simple)*/ { .52, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, /* CollNet (LL/LL128/Simple)*/ {  .5, 1.2, 4.0 } },
   /* PCI */
   { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 5.5 } },
   /* NET */
-  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 50 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
+  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
 };
 
 // LL128 max BW (per channel) for the different collectives
-// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
-static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
+// ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce
+static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.9 };
+static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
+static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} };
 
 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
-  int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
+  int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
-    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
   comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
-    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
-    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
     getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
 
-  if (comm->nRanks <= 1) return ncclSuccess;
+  int nNodes = comm->nNodes;
+  int nRanks = comm->nRanks;
+  if (nRanks <= 1) return ncclSuccess;
 
   int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
-  float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
+  int cpuArch, cpuVendor, cpuModel;
+  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+  int index2 = nNodes <= 2 ? nNodes-1 : 2;
+  // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
+  int index1 = nNodes == 1 ? compCap80 : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
+  double llMaxBw = llMaxBws[index1][index2];
+  double perChMaxTreeBw = perChMaxTreeBws[compCap80][index2];
+  float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
+
   struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
   int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
-  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
 
   for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
-    int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
-      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
-      comm->nRanks;
-    int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
-      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
-      comm->nNodes;
+    int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
+      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
+      nRanks;
+    int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
+      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
+      nNodes;
 
     for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
+      if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
+        float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
         float busBw = graphs[a]->nChannels * speed;
 
         // Various model refinements
         if (compCap80) busBw = std::min(busBw, 235.0f);
-        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL)    busBw *= (comm->nNodes > 1 || coll == ncclCollAllReduce || coll == ncclCollReduce) ? 1.0/4.0 : 1.0/3.0;
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
-        double maxTreeBw = comm->nNodes > 2 ?
-          compCap80 && p == NCCL_PROTO_LL128 ? 105.0 : 80.0 :
-          compCap80 && p == NCCL_PROTO_LL128 ? 130.0 : 110.0;
-        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, maxTreeBw);
-        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.8;
-        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
+        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
         if (a == NCCL_ALGO_COLLNET) busBw *= .9;
         if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
         if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0;  // CollNet does not support LL128
 
         // Convert bus BW to algorithm BW
-        float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
+        float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
         comm->bandwidths[coll][a][p] = busBw * ratio;
 
         comm->latencies[coll][a][p] = baseLat[a][p];
         float intraLat = hwLat[intraHw[a]][a][p];
         float interLat = hwLat[NCCL_HW_NET][a][p];
-        if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
+        if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
         if (a == NCCL_ALGO_RING) {
           float lat = hwLat[hw[a]][a][p];
-          if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
+          if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
             if (ringGraph->sameChannels) {
               comm->latencies[coll][a][p] += lat;
             } else {
@@ -144,10 +153,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
           }
         } else if (a == NCCL_ALGO_TREE) {
           comm->latencies[coll][a][p] +=
-            2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+            2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
         } else {
           comm->latencies[coll][a][p] +=
-            2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
+            2 * (nRanks/nNodes-1) * intraLat + interLat;
         }
       }
     }
@@ -168,6 +177,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
     NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
   }
+  // Disable CollNet if it is not supported
+  if (comm->collNetSupport == 0) {
+    algoEnable[NCCL_ALGO_COLLNET] = 0;
+    // If user has hard set NCCL_ALGO=COLLNET, ignore it
+    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
+      algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
+      if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
+    }
+  }
 
   for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     int pEnable = protoEnable[p];
@@ -178,7 +196,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     }
     if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
     // Only disable algo for Allreduce since others only have one
-    if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
+    if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
   }
 
   if (comm->rank == 0) {
@@ -214,7 +232,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
     comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
   }
-  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
+  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
 
   // Override defaults with user env
   char* str = getenv("NCCL_THREAD_THRESHOLDS");
@@ -243,11 +261,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 }
 
 // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
-// factor is not ideal but works quite well. Powers of two, 64 B to 128MB.
-static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
-  { 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .7,  .7,  .7,  .7,  .6,  .5,  .4,  .4,  .5,  .6,  .7,  .8,  .9, 1.0, 1.0, 1.0 },
-  { 1.0, 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .8,  .8,  .7,  .6,  .6,  .6,  .5,  .6,  .6,  .7,  .7,  .8,  .9,  .9, 1.0 },
-  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .5,  .5,  .6,  .6,  .7,  .8,  .9 }
+// factor is not ideal but works quite well. Powers of two, 64 B to 256MB.
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
+  { 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .7,  .7,  .7,  .7,  .6,  .5,  .4,  .4,  .5,  .6,  .7,  .8,  .9, 1.0, 1.0, 1.0, 1.0 },
+  { 1.0, 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .8,  .8,  .7,  .6,  .6,  .6,  .5,  .6,  .6,  .7,  .7,  .8,  .9,  .9, .92, .92 },
+  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };
 
 ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) {
@@ -257,9 +275,10 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
     *time = -1.0; return ncclSuccess;
   }
   int logSize = log2i(info->nBytes>>6);
-  if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
+  if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
+  if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
   if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
-      && info->coll == ncclCollAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
+      && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
   *time = lat + (info->nBytes) / (1000 * bw);
   return ncclSuccess;
 }
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index cc91b925e7..b2232c2b66 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -559,7 +559,6 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
   NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
   if (index == -1) {
     if (nvmlDev == NULL) {
-      WARN("No NVML, trying to use CUDA instead");
       const char* busId;
       NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
       if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1;
@@ -647,6 +646,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
         char* path;
         NCCLCHECK(getPciPath(busId, &path));
         NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
+        free(path);
       }
     }
   }
@@ -658,10 +658,14 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
   struct ncclXmlNode* node;
   NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
   NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-  nvmlDevice_t nvmlDev;
-  if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
+  nvmlDevice_t nvmlDev = NULL;
+  static int nvmlInit = 0;
+  if (nvmlInit == 0) {
+    nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
+  }
+  if (nvmlInit == 1) {
+    if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
+  }
   NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
   return ncclSuccess;
 }
@@ -704,12 +708,8 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
     for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
     char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
     strcpy(busId, pciSysPath+offset+1);
-    NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
-    if (parent == NULL) {
-      NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
-      NCCLCHECK(xmlSetAttr(parent, "busid", busId));
-      NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
-    }
+    NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
+    NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
   } else {
     // Virtual NIC, no PCI device, attach to first CPU
     NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
@@ -728,6 +728,28 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
+  const char* str;
+  NCCLCHECK(xmlGetAttr(node, "keep", &str));
+  if (str && strcmp(str, "1") == 0) {
+    NCCLCHECK(xmlUnsetAttr(node, "keep"));
+  } else {
+    // Copy nSubs and subs as they could change as we trim recursively.
+    struct ncclXmlNode* subs[MAX_SUBS];
+    int nSubs = node->nSubs;
+    memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*));
+    for (int s=0; s<nSubs; s++) {
+      NCCLCHECK(ncclTopoTrimXmlRec(subs[s]));
+    }
+    if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node));
+  }
+  return ncclSuccess;
+}
+ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
+  NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes));
+  return ncclSuccess;
+}
+
 /**************************************************/
 /* Parser rules for the user-defined graph search */
 /**************************************************/
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 22e016f3bc..9a617af5db 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -8,7 +8,7 @@
 #define XML_H_
 
 // A few constraints to make the implementation easy
-#define MAX_STR_LEN 256
+#define MAX_STR_LEN 255
 #define MAX_ATTR_COUNT 16
 #define MAX_SUBS 32
 #define MAX_NODES 1024
@@ -19,10 +19,10 @@
 #define NODE_TYPE_SINGLE 3
 
 struct ncclXmlNode {
-  char name[MAX_STR_LEN];
+  char name[MAX_STR_LEN+1];
   struct {
-    char key[MAX_STR_LEN];
-    char value[MAX_STR_LEN];
+    char key[MAX_STR_LEN+1];
+    char value[MAX_STR_LEN+1];
   } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
   int nAttrs;
   int type;
@@ -47,6 +47,9 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
 ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
 ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
 
+/* Remove unneeded parts */
+ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
+
 /**************/
 /* XML Struct */
 /* Functions  */
@@ -56,7 +59,7 @@ static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrNa
   *index = -1;
   const int nAttrs = node->nAttrs;
   for (int a=0; a<nAttrs; a++) {
-    if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
+    if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) == 0) {
       *index = a;
       return ncclSuccess;
     }
@@ -127,8 +130,10 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
   if (index == -1) {
     index = node->nAttrs++;
     strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    node->attrs[index].key[MAX_STR_LEN] = '\0';
   }
   strncpy(node->attrs[index].value, value, MAX_STR_LEN);
+  node->attrs[index].value[MAX_STR_LEN] = '\0';
   return ncclSuccess;
 }
 
@@ -138,8 +143,10 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
   if (index == -1) {
     index = node->nAttrs++;
     strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    node->attrs[index].key[MAX_STR_LEN] = '\0';
   }
   snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
+  node->attrs[index].value[MAX_STR_LEN] = '\0';
   return ncclSuccess;
 }
 
@@ -149,8 +156,22 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
   if (index == -1) {
     index = node->nAttrs++;
     strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    node->attrs[index].key[MAX_STR_LEN] = '\0';
   }
   snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
+  node->attrs[index].value[MAX_STR_LEN] = '\0';
+  return ncclSuccess;
+}
+
+static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index == -1) return ncclSuccess;
+  for (int i=index+1; i<node->nAttrs; i++) {
+    strcpy(node->attrs[i-1].key, node->attrs[i].key);
+    strcpy(node->attrs[i-1].value, node->attrs[i].value);
+  }
+  node->nAttrs--;
   return ncclSuccess;
 }
 
@@ -199,6 +220,20 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
   s->parent = parent;
   if (parent) parent->subs[parent->nSubs++] = s;
   strncpy(s->name, subName, MAX_STR_LEN);
+  s->name[MAX_STR_LEN] = '\0';
+  return ncclSuccess;
+}
+
+static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
+  node->type = NODE_TYPE_NONE;
+  struct ncclXmlNode* parent = node->parent;
+  if (parent == NULL) return ncclSuccess;
+  int shift = 0;
+  for (int s=0; s<parent->nSubs; s++) {
+    if (parent->subs[s] == node) shift = 1;
+    else if (shift) parent->subs[s-1] = parent->subs[s];
+  }
+  parent->nSubs--;
   return ncclSuccess;
 }
 
diff --git a/src/group.cc b/src/group.cc
index 5ce4901d15..78a74b65c1 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -34,7 +34,6 @@ struct ncclInitArgs {
 };
 struct ncclCollArgs {
   ncclComm_t comm;
-  int connect;
 };
 
 enum ncclAsyncFuncType {
@@ -109,6 +108,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
 
 NCCL_API(ncclResult_t, ncclGroupStart);
 ncclResult_t ncclGroupStart() {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   if (ncclGroupMode == 0) {
     memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
   }
@@ -117,7 +117,7 @@ ncclResult_t ncclGroupStart() {
 }
 
 static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
-  struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
+  struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
     sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
     1, 1 };
   info.delta = delta;
@@ -125,26 +125,32 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
   info.sendbytes = sendbytes;
   info.recvbytes = recvbytes;
   if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
-  NCCLCHECK(ncclSaveKernel(&info));
+  NCCLCHECK(ncclSaveP2pKernel(&info));
   return ncclSuccess;
 }
 
 void* ncclAsyncThreadPreconnect(void* args_) {
   struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
-  CUDACHECKTHREAD(cudaSetDevice(args->coll.comm->cudaDev));
-  for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
-    struct ncclComm* comm = args->coll.comm;
-    struct ncclChannel* channel = comm->channels+c;
-    struct ncclP2PConnect* connect = &comm->p2plist.connect;
-    NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
-    connect->nrecv[c] = 0;
-    connect->nsend[c] = 0;
-  }
+  struct ncclComm* comm = args->coll.comm;
+  CUDACHECKTHREAD(cudaSetDevice(comm->cudaDev));
+  NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL));
   return args;
 }
 
+static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
+  size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
+  int nChannels = minChannels;
+  while (size > maxSize && nChannels <= maxChannels/2) {
+    nChannels *= 2;
+    size = DIVUP(totalSize, nChannels);
+  }
+  ALIGN_SIZE(size, minSize);
+  return size;
+}
+
 NCCL_API(ncclResult_t, ncclGroupEnd);
 ncclResult_t ncclGroupEnd() {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   if (ncclGroupMode == 0) {
     WARN("ncclGroupEnd: not in a group call.");
     return ncclInvalidUsage;
@@ -185,29 +191,21 @@ ncclResult_t ncclGroupEnd() {
 
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
-      if (p2plist->count != 0) {
-        struct ncclComm* comm = args->coll.comm;
-        args->coll.connect = 0;
-        for (int c=0; c<comm->p2pnChannels; c++)
-          args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
-        if (args->coll.connect) {
-          pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
-        }
-      }
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
+      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
     }
   }
 
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
       int err = pthread_join(ncclGroupThreads[i], NULL);
       if (err != 0) {
         WARN("Error waiting for pthread_join : %s\n", strerror(errno));
         return ncclSystemError;
       }
       NCCLCHECKGOTO(args->ret, ret, end);
+      args->coll.comm->connect = 0;
     }
   }
 
@@ -217,48 +215,83 @@ ncclResult_t ncclGroupEnd() {
       struct ncclComm* comm = args->coll.comm;
       int rank = comm->rank;
       int nRanks = comm->nRanks;
-      struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
-      if (p2plist->count) {
-        for (int delta=0; delta<nRanks; delta++) {
+      struct ncclP2Plist* p2pSends = comm->p2pSends;
+      struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
+
+      // Compute how much to split operations
+      // Natural step size matching buffer steps.
+      ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
+      // Try to use all channels
+      int nChannelsMax = comm->p2pnChannelsPerPeer;
+      int nChannelsMin = nChannelsMax;
+      // Try to use all channels, but one channel per operation.
+      while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
+      // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
+      while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
+
+      while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
+        // schedule delta 0, +1, -1, +2, -2, ...
+        // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
+        for (int d=0; d<=nRanks/4; d++) {
+          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
+          int index = 0;
+          int delta = deltas[index];
+sched_delta:
           uint32_t from = (rank+nRanks-delta)%nRanks;
           uint32_t to = (rank+delta)%nRanks;
+          struct ncclP2Pinfo* recv = p2pRecvs[from].head;
+          struct ncclP2Pinfo* send = p2pSends[to].head;
+          if (recv != NULL || send != NULL) {
+            ssize_t totRecvBytes = -1, totSendBytes = -1;
+            if (recv != NULL) totRecvBytes = recv->nbytes;
+            if (send != NULL) totSendBytes = send->nbytes;
+            ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
+            ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
 
-          // Compute how much to split operations
-          // Natural step size matching buffer steps.
-          ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
-          // Split each operation on p2pnChannelsPerPeer max.
-          ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
-          ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
-          recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
-          sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
-
-          ssize_t sendOffset = 0;
-          ssize_t recvOffset = 0;
-          int remaining = 1;
-          int chunk = 0;
-          while (remaining) {
-            int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
-            remaining = 0;
-            ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
-            ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
-            if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
-            if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
-            if (sendbytes >= 0 || recvbytes >= 0) {
-              NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
-                    recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
-                    sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
+            ssize_t sendOffset = 0;
+            ssize_t recvOffset = 0;
+            int sendRemaining = 1, recvRemaining = 1;
+            int chunk = 0;
+            do {
+              int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
+              ssize_t recvbytes = totRecvBytes-recvOffset;
+              ssize_t sendbytes = totSendBytes-sendOffset;
+              if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
+              if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+              if (sendbytes >= 0 || recvbytes >= 0) {
+                NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
+                      recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
+                      sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
+              }
+              recvOffset += recvChunkSize;
+              sendOffset += sendChunkSize;
+              chunk++;
+            } while (sendRemaining || recvRemaining);
+            if (recv) {
+              NCCLCHECKGOTO(dequeueP2pInfo(p2pRecvs+from), ret, group_cleanup);
+              comm->p2pRecvCount--;
             }
-            recvOffset += recvChunkSize;
-            sendOffset += sendChunkSize;
-            chunk++;
+            if (send) {
+              NCCLCHECKGOTO(dequeueP2pInfo(p2pSends+to), ret, group_cleanup);
+              comm->p2pSendCount--;
+            }
+          }
+          index++;
+          if (index == 1 && deltas[1] == deltas[0]) index++;
+          if (index == 2 && deltas[2] == deltas[0]) index++;
+          if (index == 3 && deltas[3] == deltas[2]) index++;
+          if (index == 3 && deltas[3] == deltas[1]) index++;
+          if (index < 4) {
+            delta = deltas[index];
+            goto sched_delta;
           }
         }
-        p2plist->count = 0;
       }
     }
   }
 
   /* Collectives are done in three steps :
+   * 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
    * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
    * 2. Barrier Wait. No CUDA call is permitted
    * 3. Enqueue Events. CUDA event wait/enqueue.
@@ -267,6 +300,13 @@ ncclResult_t ncclGroupEnd() {
    * prevent some ranks from launching their network threads, which would
    * prevent the NCCL call from completing, blocking the cudaFree call.
    */
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_COLL) {
+      ncclComm_t comm = args->coll.comm;
+      NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup);
+    }
+  }
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
     if (args->funcType == ASYNC_FUNC_COLL) {
@@ -303,32 +343,28 @@ group_cleanup:
         *args->init.newcomm = NULL;
       } else {
         struct ncclComm* comm = args->coll.comm;
-        for (int c=0; c<comm->p2pnChannels; c++) {
-          struct ncclChannel* channel = comm->channels+c;
-          for (int i=0; i<channel->collCount; i++) {
-            channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+        // Reset aggregation counters
+        comm->asyncOpCount = 0;
+        comm->asyncTotalSize = 0;
+        // Dequeue p2p lists
+        if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
+          struct ncclP2Plist* p2pSends = comm->p2pSends;
+          struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
+          for (int peer=0; peer<comm->nRanks; peer++) {
+            while (p2pSends[peer].head != NULL) dequeueP2pInfo(p2pSends+peer);
+            while (p2pRecvs[peer].head != NULL) dequeueP2pInfo(p2pRecvs+peer);
           }
-          channel->collFifoTail = channel->collStart;
-          channel->collCount = 0;
+          comm->p2pSendCount = comm->p2pRecvCount = 0;
         }
-        /* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
+        /* Free all proxy ops in state->nextOps */
         struct ncclProxyState* state = &comm->proxyState;
-        struct ncclProxyArgs *op, *start;
-        pthread_mutex_lock(&state->mutex);
-        op = start = state->ops;
-        while (op) {
-          if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
-          struct ncclProxyArgs* peerOp = op->nextPeer;
-          while (peerOp) {
-            if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
-            peerOp = peerOp->nextPeer;
-          }
-          op = op->next;
-          if (op == start) break;
+	pthread_mutex_lock(&state->poolMutex);
+	for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
+          op->next = state->pool;
+          state->pool = op;
         }
-        comm->opCount = comm->lastOpCount;
-        pthread_cond_signal(&state->cond);
-        pthread_mutex_unlock(&state->mutex);
+	pthread_mutex_unlock(&state->poolMutex);
+        state->nextOps = NULL;
 
         comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
         comm->userStreamSet = false;
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index a7d6be965e..dbe4320d49 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,6 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
+ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr);
+ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
 ncclResult_t bootstrapClose(void* commState);
 ncclResult_t bootstrapAbort(void* commState);
 #endif
diff --git a/src/include/coll_net.h b/src/include/coll_net.h
index 3278560d9a..0d17b76036 100644
--- a/src/include/coll_net.h
+++ b/src/include/coll_net.h
@@ -24,7 +24,7 @@ static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, voi
 static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
 static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
   NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
-static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
 static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
 static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
 static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
diff --git a/src/include/collectives.h b/src/include/collectives.h
index f854364ce4..9b9022e390 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -8,55 +8,58 @@
 #define NCCL_COLLECTIVES_H_
 
 #define FUNC_INDEX_P2P 0
-#define FUNC_INDEX(coll, redop, dtype, al, pr) (1+(((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
+#define FUNC_INDEX(func, redop, ncclType, al, pr) (1+(((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
 
-#define NCCL_COLL_NAME(coll, op, dtype) \
-  coll##_##op##_##dtype
+#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
+  ncclFunction_##func##_##algo##_##proto##_##redop##_##type
 
-#define NCCL_KERN_NAME(coll, op, dtype) \
-  coll##Kernel_##op##_##dtype
+#define NCCL_KERN_NAME(func, algo, proto, redop, type) \
+  ncclKernel_##func##_##algo##_##proto##_##redop##_##type
+
+#define NCCL_IMPL_NAME(func, algo, proto) \
+  nccl##func##algo##proto
 
 /* Declare all collective operations */
-#define DECL_COLL5(coll, op, dtype) \
-  extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
-  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
+#define DECL5(func, algo, proto, redop, type) \
+  extern __device__ void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args); \
+  extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem c); \
 
-#define DECL_COLL4(coll, op, dtype) \
-  DECL_COLL5(coll, op, dtype) \
-  DECL_COLL5(coll##LL, op, dtype) \
-  DECL_COLL5(coll##LL128, op, dtype)
+#define DECL4(func, algo, redop, type) \
+  DECL5(func, algo, SIMPLE, redop, type) \
+  DECL5(func, algo, LL,     redop, type) \
+  DECL5(func, algo, LL128,  redop, type)
 
-#define DECL_COLL3(coll, op, dtype) \
-  DECL_COLL4(coll##Ring, op, dtype) \
-  DECL_COLL4(coll##Tree, op, dtype) \
-  DECL_COLL4(coll##CollNet, op, dtype)
+#define DECL3(func, redop, type) \
+  DECL4(func, RING,    redop, type) \
+  DECL4(func, TREE,    redop, type) \
+  DECL4(func, COLLNET, redop, type)
 
-#define DECL_COLL2(coll, op) \
-  DECL_COLL3(coll, op, i8) \
-  DECL_COLL3(coll, op, u8) \
-  DECL_COLL3(coll, op, i32) \
-  DECL_COLL3(coll, op, u32) \
-  DECL_COLL3(coll, op, i64) \
-  DECL_COLL3(coll, op, u64) \
-  DECL_COLL3(coll, op, f16) \
-  DECL_COLL3(coll, op, f32) \
-  DECL_COLL3(coll, op, f64)
+#define DECL2(func, redop) \
+  DECL3(func, redop, int8_t) \
+  DECL3(func, redop, uint8_t) \
+  DECL3(func, redop, int32_t) \
+  DECL3(func, redop, uint32_t) \
+  DECL3(func, redop, int64_t) \
+  DECL3(func, redop, uint64_t) \
+  DECL3(func, redop, half) \
+  DECL3(func, redop, float) \
+  DECL3(func, redop, double)
 
-#define DECL_COLL(coll) \
-  DECL_COLL2(coll, sum) \
-  DECL_COLL2(coll, prod) \
-  DECL_COLL2(coll, min) \
-  DECL_COLL2(coll, max)
+#define DECL(func) \
+  DECL2(func, Sum) \
+  DECL2(func, Prod) \
+  DECL2(func, Min) \
+  DECL2(func, Max)
 
-#define DECL_ALL_COLLS \
-  DECL_COLL2(ncclBroadcast, copy) \
-  DECL_COLL(ncclReduce) \
-  DECL_COLL2(ncclAllGather, copy) \
-  DECL_COLL(ncclReduceScatter) \
-  DECL_COLL(ncclAllReduce) \
-  DECL_COLL5(ncclSendRecv,copy,i8) \
+#define DECL_ALL \
+  DECL2(Broadcast, Sum) \
+  DECL(Reduce) \
+  DECL2(AllGather, Sum) \
+  DECL(ReduceScatter) \
+  DECL(AllReduce) \
+  DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
 
-DECL_ALL_COLLS
+DECL_ALL
 
 // CHUNKSIZE must be a multiple of SLICESIZE
 #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
diff --git a/src/include/comm.h b/src/include/comm.h
index 8a44747f21..56116e0070 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -48,8 +48,8 @@ struct ncclRecvMem {
     struct {
       uint64_t tail;
       char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
       int sizesFifo[NCCL_STEPS];
+      void* ptrsFifo[NCCL_STEPS];
     };
     char pad4[MEM_ALIGN];
   };
@@ -63,6 +63,10 @@ struct ncclComm {
   struct ncclTopoSystem* topo;
 
   void* bootstrap;
+  // Bitmasks for ncclTransportP2pSetup
+  int connect;
+  uint32_t* connectSend;
+  uint32_t* connectRecv;
 
   int rank;    // my rank in the communicator
   int nRanks;  // number of GPUs in communicator
@@ -127,7 +131,7 @@ struct ncclComm {
   int* intraCudaDevs;
   int* intraCGMode; // Whether we can use CUDA9 CGMD or not
   int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclColl args;
+  struct ncclWorkElem args;
   void* argsptr;
 
   // Global proxy thread
@@ -136,8 +140,17 @@ struct ncclComm {
 
   // Whether this communicator uses collNet
   int collNetSupport;
+
+  // Store info of async operations
+  struct ncclInfo* asyncOps;
+  int asyncOpCount;
+  size_t asyncTotalSize;
+
   //list of async p2p operation queued in a group semantics
-  struct ncclP2Plist p2plist;
+  struct ncclP2Plist* p2pSends;
+  struct ncclP2Plist* p2pRecvs;
+  int p2pSendCount;
+  int p2pRecvCount;
 };
 
 #endif
diff --git a/src/include/core.h b/src/include/core.h
index 0435d9b660..2283134415 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -55,5 +55,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
 #include "alloc.h"
 #include "utils.h"
 #include "param.h"
+#include "nvtx.h"
 
 #endif // end include guard
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
index 40c159410e..ec55cbc54a 100644
--- a/src/include/cpuset.h
+++ b/src/include/cpuset.h
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
 
 #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
 
-ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
+static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
   uint32_t cpumasks[CPU_SET_N_U32];
   int m = CPU_SET_N_U32-1;
   cpumasks[m] = 0;
@@ -42,7 +42,7 @@ ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
+static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
   int c = 0;
   uint8_t* m8 = (uint8_t*)mask;
   for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 2be7bbaca3..9870117a04 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -12,7 +12,7 @@
 #include <stdint.h>
 
 #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollSendRecv} ncclFunc_t;
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
 
 #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
@@ -47,8 +47,9 @@ union ncclLLFifoLine {
 
 #define WARP_SIZE 32
 #define MAXCHANNELS 32
-#define NCCL_MAX_NTHREADS 512
-#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
+#define NCCL_MAX_NTHREADS 640
+#define NCCL_SIMPLE_MAX_NTHREADS 512
+#define NCCL_LL_MAX_NTHREADS 512
 #define NCCL_LL_LINES_PER_THREAD 8
 #ifdef TEST_LL_CLEANUP
 #define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
@@ -85,9 +86,11 @@ struct ncclConnInfo {
   uint64_t *head;     // Local for send, remote for recv
 
   int direct;         // Direct communication
+  int shared;         // Buffers are shared
   void **ptrExchange; // Pointer exchange for direct communication
 
-  int *fifo;          // Size fifo for proxy
+  int *sizesFifo;     // Sizes fifo from GPU to proxy
+  void* *ptrsFifo;      // Buffer fifo from proxy to GPU
 
   uint64_t step;      // Keep where we are
   uint64_t llLastCleaning;
@@ -129,60 +132,52 @@ struct ncclPeer {
 
 struct ncclDevComm;
 
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclDevComm* comm;
+#define NCCL_MAX_WORK_ELEMENTS 8
+#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
+
+/* ncclWork is to be a power of two, currently 8x64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclWorkElem. */
+struct ncclWorkElem {
+  // Header
+  struct ncclDevComm* comm;
+  uint16_t nThreads;
+  uint16_t funcIndex;
+  uint16_t index;
+  uint16_t active;
 
-  // local and remote input, output, and buffer
   const void * sendbuff;
   void * recvbuff;
 
-  // Op-specific fields. Make sure the common part stays the
-  // same on all structs of the union
+  // Op-specific fields.
   union {
     struct {
-      uint16_t nThreads;
-    } common;
-    struct {
-      uint16_t nThreads;
-      uint8_t bid;
-      uint8_t nChannels;
-      uint32_t root;
       size_t count;
       size_t lastChunkSize;
+      uint32_t root;
+      uint8_t bid;
+      uint8_t nChannels;
     } coll;
     struct {
-      uint16_t nThreads;
-      uint16_t unused;
-      int32_t delta;
       size_t sendCount;
       size_t recvCount;
+      int32_t delta;
+      uint16_t nThreads;
     } p2p;
+    uint64_t align[4];
   };
 };
-struct ncclColl {
-  union {
-    struct {
-      struct CollectiveArgs args;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t  active;
-    };
-    int data[0x10];
-  };
+struct ncclWork {
+  struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
 };
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
 
 struct ncclChannel {
   union {
     struct {
       struct ncclRing ring;
-      struct ncclTree treeUp;
-      struct ncclTree treeDn;
-      struct ncclTree collTreeUp;
-      struct ncclTree collTreeDn;
+      struct ncclTree tree;
+      struct ncclTree collTree;
 
       int id;
 
@@ -191,11 +186,9 @@ struct ncclChannel {
       struct ncclPeer* devPeers;
 
       // Operation list for aggregation
-      struct ncclColl* collectives;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
+      struct ncclWork* workFifo;
+      int workCount;
+      uint64_t workFifoTail; // Only used by CPU
     };
     int data[0x80];
   };
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index a7e6e50395..2c2ab1fe78 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -19,5 +19,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
 ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
 ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
 ncclResult_t ncclSaveKernel(struct ncclInfo* info);
+ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
+ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
 
 #endif // End include guard
diff --git a/src/include/graph.h b/src/include/graph.h
index 70117d5e91..a4dba5c14e 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -29,7 +29,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
 
 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
 
 // Set CPU affinity
@@ -43,15 +43,16 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
 #define NCCL_TOPO_CPU_TYPE_BDW 1
 #define NCCL_TOPO_CPU_TYPE_SKL 2
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
+ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
 
 #define NCCL_TOPO_MAX_NODES 256
 
 // Init search. Needs to be done before calling ncclTopoCompute
 ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
 
-#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
-#define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Split tree (send/recv from different ranks) flowing in both directions
-#define NCCL_TOPO_PATTERN_TREE 3            // Simple tree (send/recv from same rank) flowing in both directions
+#define NCCL_TOPO_PATTERN_BALANCED_TREE 1   // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
+#define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
+#define NCCL_TOPO_PATTERN_TREE 3            // All NIC traffic going to/from the same GPU
 #define NCCL_TOPO_PATTERN_RING 4            // Ring
 struct ncclTopoGraph {
   // Input / output
@@ -82,17 +83,16 @@ struct ncclTopoRanks {
   int ringSend[MAXCHANNELS];
   int ringPrev[MAXCHANNELS];
   int ringNext[MAXCHANNELS];
-  int treeUpRecv[MAXCHANNELS];
-  int treeUpSend[MAXCHANNELS];
-  int treeDnRecv[MAXCHANNELS];
-  int treeDnSend[MAXCHANNELS];
+  int treeToParent[MAXCHANNELS];
+  int treeToChild0[MAXCHANNELS];
+  int treeToChild1[MAXCHANNELS];
 };
 
 ncclResult_t ncclTopoPreset(struct ncclComm* comm,
     struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
     struct ncclTopoRanks* topoRanks);
 
-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
     struct ncclTopoRanks** allTopoRanks, int* rings);
 
 ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index fd19f81050..8c016dc76f 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -15,6 +15,9 @@
 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
 
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 8
+
 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
 typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
 
@@ -29,9 +32,9 @@ typedef struct {
   int speed;      // Port speed in Mbps.
   int port;       // Port number.
   int maxComms;   // Maximum number of comms we can create
-}ncclNetProperties_v3_t;
+}ncclNetProperties_v4_t;
 
-typedef ncclNetProperties_v3_t ncclNetProperties_t;
+typedef ncclNetProperties_v4_t ncclNetProperties_t;
 
 typedef struct {
   // Name of the network (mainly for logs)
@@ -41,7 +44,7 @@ typedef struct {
   // Return the number of adapters.
   ncclResult_t (*devices)(int* ndev);
   // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
   // Create a receiving object and provide a handle to connect to it. The
   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
   // between ranks to create a connection.
@@ -62,7 +65,7 @@ typedef struct {
   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
   // visible to the GPU
-  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
   // Test whether a request is complete. If size is not NULL, it returns the
   // number of bytes sent/received.
   ncclResult_t (*test)(void* request, int* done, int* size);
@@ -70,11 +73,11 @@ typedef struct {
   ncclResult_t (*closeSend)(void* sendComm);
   ncclResult_t (*closeRecv)(void* recvComm);
   ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v3_t;
+} ncclNet_v4_t;
 
-typedef ncclNet_v3_t ncclNet_t;
+typedef ncclNet_v4_t ncclNet_t;
 
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
 
 typedef struct {
   // Name of the collective network (mainly for logs)
@@ -85,7 +88,7 @@ typedef struct {
   // If ndev returns 0, all other functions might be set to NULL.
   ncclResult_t (*devices)(int* ndev);
   // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
   // Create a receiving object and provide a handle to connect to it. The
   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
   // between ranks to create connections.
@@ -105,17 +108,17 @@ typedef struct {
       ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
   // visible to the GPU
-  ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
   // Test whether a request is complete. If size is not NULL, it returns the
   // number of bytes sent/received.
   ncclResult_t (*test)(void* request, int* done, int* size);
   // Close and free collective comm objects
   ncclResult_t (*closeColl)(void* collComm);
   ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v3_t;
+} ncclCollNet_v4_t;
 
-typedef ncclCollNet_v3_t ncclCollNet_t;
+typedef ncclCollNet_v4_t ncclCollNet_t;
 
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
 
 #endif // end include guard
diff --git a/src/include/net.h b/src/include/net.h
index bc81965695..244215eb55 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -24,7 +24,7 @@ static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, voi
 static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
 static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
 static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
 static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 01bbb7f024..21ee82eaf8 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -45,14 +45,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
   NVMLCHECK(nvmlDeviceGetIndex(device, index));
   return ncclSuccess;
 }
-static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
-  NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
-  NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
-  return ncclSuccess;
-}
 static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
   NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
   return ncclSuccess;
@@ -66,10 +58,6 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
   NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
   return ncclSuccess;
 }
-static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
-  NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
-  return ncclSuccess;
-}
 static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
   NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
   return ncclSuccess;
@@ -150,12 +138,10 @@ ncclResult_t wrapNvmlShutdown(void);
 ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
 ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
 ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
                                                    nvmlNvLinkCapability_t capability, unsigned int *capResult);
-ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
 ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
 
 #endif // NVML_DIRECT
diff --git a/src/include/nvtx.h b/src/include/nvtx.h
new file mode 100644
index 0000000000..77961262ea
--- /dev/null
+++ b/src/include/nvtx.h
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVTX_H_
+#define NCCL_NVTX_H_
+
+#include "nvtx3.hpp"
+
+struct nccl_domain{static constexpr char const* name{"NCCL"};};
+
+#endif
diff --git a/src/include/nvtx3.hpp b/src/include/nvtx3.hpp
new file mode 100644
index 0000000000..1e993732c0
--- /dev/null
+++ b/src/include/nvtx3.hpp
@@ -0,0 +1,2268 @@
+/*
+ *  Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/* Temporary helper #defines, #undef'ed at end of header */
+#define NVTX3_CPP_VERSION_MAJOR 1
+#define NVTX3_CPP_VERSION_MINOR 0
+
+/* This section handles the decision of whether to provide unversioned symbols.
+ * If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is #defined, unversioned symbols are
+ * not provided, and explicit-version symbols such as nvtx3::v1::thread_range
+ * and NVTX3_V1_FUNC_RANGE must be used.  By default, the first #include of this
+ * header will define the unversioned symbols such as nvtx3::thread_range and
+ * NVTX3_FUNC_RANGE.  Subsequently including a different major version of this
+ * header without #defining NVTX3_CPP_REQUIRE_EXPLICIT_VERSION triggers an error
+ * since the symbols would conflict.  Subsequently including of a different
+ * minor version within the same major version is allowed. Functionality of
+ * minor versions is cumulative, regardless of include order.
+ * 
+ * Since NVTX3_CPP_REQUIRE_EXPLICIT_VERSION allows all combinations of versions
+ * to coexist without problems within a translation unit, the recommended best
+ * practice for instrumenting header-based libraries with NVTX C++ Wrappers is
+ * is to #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before including nvtx3.hpp,
+ * #undef it afterward, and only use explicit-version symbols.  This is not
+ * necessary in common cases, such as instrumenting a standalone application, or
+ * static/shared libraries in .cpp files or headers private to those projects.
+ */
+/* clang-format off */
+#if !defined(NVTX3_CPP_REQUIRE_EXPLICIT_VERSION)
+    /* Define macro used by all definitions in this header to indicate the
+     * unversioned symbols should be defined in addition to the versioned ones.
+     */
+    #define NVTX3_INLINE_THIS_VERSION
+
+    #if !defined(NVTX3_CPP_INLINED_VERSION_MAJOR)
+        /* First occurrence of this header in the translation unit.  Define macros
+         * indicating which version shall be used for unversioned symbols.
+         */
+
+        /**
+         * @brief Semantic major version number for NVTX C++ wrappers of unversioned symbols
+         *
+         * Breaking changes may occur between major versions, and different major versions
+         * cannot provide unversioned symbols in the same translation unit (.cpp file).
+         * 
+         * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined.
+         *
+         * Not to be confused with the version number of the NVTX core library.
+         */
+        #define NVTX3_CPP_INLINED_VERSION_MAJOR 1  // NVTX3_CPP_VERSION_MAJOR
+
+        /**
+         * @brief Semantic minor version number for NVTX C++ wrappers of unversioned symbols
+         *
+         * No breaking changes occur between minor versions -- minor version changes within
+         * a major version are purely additive.
+         *
+         * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined.
+         *
+         * Not to be confused with the version number of the NVTX core library.
+         */
+        #define NVTX3_CPP_INLINED_VERSION_MINOR 0  // NVTX3_CPP_VERSION_MINOR
+    #elif NVTX3_CPP_INLINED_VERSION_MAJOR != NVTX3_CPP_VERSION_MAJOR
+        /* Unsupported case -- cannot define unversioned symbols for different major versions
+         * in the same translation unit.
+         */
+        #error \
+          "Two different major versions of the NVTX C++ Wrappers are being included in a single .cpp file, with unversioned symbols enabled in both.  Only one major version can enable unversioned symbols in a .cpp file.  To disable unversioned symbols, #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before #including nvtx3.hpp, and use the explicit-version symbols instead -- this is the preferred way to use nvtx3.hpp from a header file."
+    #elif (NVTX3_CPP_INLINED_VERSION_MAJOR == NVTX3_CPP_VERSION_MAJOR) && \
+      (NVTX3_CPP_INLINED_VERSION_MINOR < NVTX3_CPP_VERSION_MINOR)
+        /* An older minor version of the same major version already defined unversioned
+         * symbols.  The new features provided in this header will be inlined 
+         * redefine the minor version macro to this header's version.
+         */
+        #undef NVTX3_CPP_INLINED_VERSION_MINOR
+        #define NVTX3_CPP_INLINED_VERSION_MINOR 0  // NVTX3_CPP_VERSION_MINOR
+        // else, already have this version or newer, nothing to do
+    #endif
+#endif
+/* clang-format on */
+
+#include <nvtx3/nvToolsExt.h>
+
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+/**
+ * @file nvtx3.hpp
+ *
+ * @brief Provides C++ constructs making the NVTX library safer and easier to
+ * use with zero overhead.
+ */
+
+/**
+ * \mainpage
+ * \tableofcontents
+ *
+ * \section QUICK_START Quick Start
+ *
+ * To add NVTX ranges to your code, use the `nvtx3::thread_range` RAII object. A
+ * range begins when the object is created, and ends when the object is
+ * destroyed.
+ *
+ * \code{.cpp}
+ * #include "nvtx3.hpp"
+ * void some_function(){
+ *    // Begins a NVTX range with the messsage "some_function"
+ *    // The range ends when some_function() returns and `r` is destroyed
+ *    nvtx3::thread_range r{"some_function"};
+ *
+ *    for(int i = 0; i < 6; ++i){
+ *       nvtx3::thread_range loop{"loop range"};
+ *       std::this_thread::sleep_for(std::chrono::seconds{1});
+ *    }
+ * } // Range ends when `r` is destroyed
+ * \endcode
+ *
+ * The example code above generates the following timeline view in Nsight
+ * Systems:
+ *
+ * \image html
+ * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
+ *
+ * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
+ * ranges to your code that automatically use the name of the enclosing function
+ * as the range's message.
+ *
+ * \code{.cpp}
+ * #include "nvtx3.hpp"
+ * void some_function(){
+ *    // Creates a range with a message "some_function" that ends when the
+ * enclosing
+ *    // function returns
+ *    NVTX3_FUNC_RANGE();
+ *    ...
+ * }
+ * \endcode
+ *
+ *
+ * \section Overview
+ *
+ * The NVTX library provides a set of functions for users to annotate their code
+ * to aid in performance profiling and optimization. These annotations provide
+ * information to tools like Nsight Systems to improve visualization of
+ * application timelines.
+ *
+ * \ref RANGES are one of the most commonly used NVTX constructs for annotating
+ * a span of time. For example, imagine a user wanted to see every time a
+ * function, `my_function`, is called and how long it takes to execute. This can
+ * be accomplished with an NVTX range created on the entry to the function and
+ * terminated on return from `my_function` using the push/pop C APIs:
+ *
+ * ```
+ * void my_function(...){
+ *    nvtxRangePushA("my_function"); // Begins NVTX range
+ *    // do work
+ *    nvtxRangePop(); // Ends NVTX range
+ * }
+ * ```
+ *
+ * One of the challenges with using the NVTX C API is that it requires manually
+ * terminating the end of the range with `nvtxRangePop`. This can be challenging
+ * if `my_function()` has multiple returns or can throw exceptions as it
+ * requires calling `nvtxRangePop()` before all possible return points.
+ *
+ * NVTX++ solves this inconvenience through the "RAII" technique by providing a
+ * `nvtx3::thread_range` class that begins a range at construction and ends the
+ * range on destruction. The above example then becomes:
+ *
+ * ```
+ * void my_function(...){
+ *    nvtx3::thread_range r{"my_function"}; // Begins NVTX range
+ *    // do work
+ * } // Range ends on exit from `my_function` when `r` is destroyed
+ * ```
+ *
+ * The range object `r` is deterministically destroyed whenever `my_function`
+ * returns---ending the NVTX range without manual intervention. For more
+ * information, see \ref RANGES and `nvtx3::domain_thread_range`.
+ *
+ * Another inconvenience of the NVTX C APIs are the several constructs where the
+ * user is expected to initialize an object at the beginning of an application
+ * and reuse that object throughout the lifetime of the application. For example
+ * Domains, Categories, and Registered messages.
+ *
+ * Example:
+ * ```
+ * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain");
+ * // Reuse `D` throughout the rest of the application
+ * ```
+ *
+ * This can be problematic if the user application or library does not have an
+ * explicit initialization function called before all other functions to
+ * ensure that these long-lived objects are initialized before being used.
+ *
+ * NVTX++ makes use of the "construct on first use" technique to alleviate this
+ * inconvenience. In short, a function local static object is constructed upon
+ * the first invocation of a function and returns a reference to that object on
+ * all future invocations. See the documentation for
+ * `nvtx3::registered_string`, `nvtx3::domain`, `nvtx3::named_category`,  and
+ * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more
+ * information.
+ *
+ * Using construct on first use, the above example becomes:
+ * ```
+ * struct my_domain{ static constexpr char const* name{"my domain"}; };
+ *
+ * // The first invocation of `domain::get` for the type `my_domain` will
+ * // construct a `nvtx3::domain` object and return a reference to it. Future
+ * // invocations simply return a reference.
+ * nvtx3::domain const& D = nvtx3::domain::get<my_domain>();
+ * ```
+ * For more information about NVTX and how it can be used, see
+ * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and
+ * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
+ * for more information.
+ *
+ * \section RANGES Ranges
+ *
+ * Ranges are used to describe a span of time during the execution of an
+ * application. Common examples are using ranges to annotate the time it takes
+ * to execute a function or an iteration of a loop.
+ *
+ * NVTX++ uses RAII to automate the generation of ranges that are tied to the
+ * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard
+ * Template Library.
+ *
+ * \subsection THREAD_RANGE Thread Range
+ *
+ * `nvtx3::domain_thread_range` is a class that begins a range upon construction
+ * and ends the range at destruction. This is one of the most commonly used
+ * constructs in NVTX++ and is useful for annotating spans of time on a
+ * particular thread. These ranges can be nested to arbitrary depths.
+ *
+ * `nvtx3::thread_range` is an alias for a `nvtx3::domain_thread_range` in the
+ * global NVTX domain. For more information about Domains, see \ref DOMAINS.
+ *
+ * Various attributes of a range can be configured constructing a
+ * `nvtx3::domain_thread_range` with a `nvtx3::event_attributes` object. For
+ * more information, see \ref ATTRIBUTES.
+ *
+ * Example:
+ *
+ * \code{.cpp}
+ * void some_function(){
+ *    // Creates a range for the duration of `some_function`
+ *    nvtx3::thread_range r{};
+ *
+ *    while(true){
+ *       // Creates a range for every loop iteration
+ *       // `loop_range` is nested inside `r`
+ *       nvtx3::thread_range loop_range{};
+ *    }
+ * }
+ * \endcode
+ *
+ * \subsection PROCESS_RANGE Process Range
+ *
+ * `nvtx3::domain_process_range` is identical to `nvtx3::domain_thread_range`
+ * with the exception that a `domain_process_range` can be created and destroyed
+ * on different threads. This is useful to annotate spans of time that can
+ * bridge multiple threads.
+ *
+ * `nvtx3::domain_thread_range`s should be preferred unless one needs the
+ * ability to begin and end a range on different threads.
+ *
+ * \section MARKS Marks
+ *
+ * `nvtx3::mark` allows annotating an instantaneous event in an application's
+ * timeline. For example, indicating when a mutex is locked or unlocked.
+ *
+ * \code{.cpp}
+ * std::mutex global_lock;
+ * void lock_mutex(){
+ *    global_lock.lock();
+ *    // Marks an event immediately after the mutex is locked
+ *    nvtx3::mark<my_domain>("lock_mutex");
+ * }
+ * \endcode
+ *
+ * \section DOMAINS Domains
+ *
+ * Similar to C++ namespaces, Domains allow for scoping NVTX events. By default,
+ * all NVTX events belong to the "global" domain. Libraries and applications
+ * should scope their events to use a custom domain to differentiate where the
+ * events originate from.
+ *
+ * It is common for a library or application to have only a single domain and
+ * for the name of that domain to be known at compile time. Therefore, Domains
+ * in NVTX++ are represented by _tag types_.
+ *
+ * For example, to define a custom  domain, simply define a new concrete type
+ * (a `class` or `struct`) with a `static` member called `name` that contains
+ * the desired name of the domain.
+ *
+ * ```
+ * struct my_domain{ static constexpr char const* name{"my domain"}; };
+ * ```
+ *
+ * For any NVTX++ construct that can be scoped to a domain, the type `my_domain`
+ * can be passed as an explicit template argument to scope it to the custom
+ * domain.
+ *
+ * The tag type `nvtx3::domain::global` represents the global NVTX domain.
+ *
+ * \code{.cpp}
+ * // By default, `domain_thread_range` belongs to the global domain
+ * nvtx3::domain_thread_range<> r0{};
+ *
+ * // Alias for a `domain_thread_range` in the global domain
+ * nvtx3::thread_range r1{};
+ *
+ * // `r` belongs to the custom domain
+ * nvtx3::domain_thread_range<my_domain> r{};
+ * \endcode
+ *
+ * When using a custom domain, it is reccomended to define type aliases for NVTX
+ * constructs in the custom domain.
+ * ```
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ * using my_registered_string = nvtx3::registered_string<my_domain>;
+ * using my_named_category = nvtx3::named_category<my_domain>;
+ * ```
+ *
+ * See `nvtx3::domain` for more information.
+ *
+ * \section ATTRIBUTES Event Attributes
+ *
+ * NVTX events can be customized with various attributes to provide additional
+ * information (such as a custom message) or to control visualization of the
+ * event (such as the color used). These attributes can be specified per-event
+ * via arguments to a `nvtx3::event_attributes` object.
+ *
+ * NVTX events can be customized via four "attributes":
+ * - \ref COLOR : color used to visualize the event in tools.
+ * - \ref MESSAGES :  Custom message string.
+ * - \ref PAYLOAD :  User-defined numerical value.
+ * - \ref CATEGORY : Intra-domain grouping.
+ *
+ * It is possible to construct a `nvtx3::event_attributes` from any number of
+ * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload,
+ * nvtx3::category) in any order. If an attribute is not specified, a tool
+ * specific default value is used. See `nvtx3::event_attributes` for more
+ * information.
+ *
+ * \code{.cpp}
+ * // Custom color, message
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                      "message"};
+ *
+ * // Custom color, message, payload, category
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                      nvtx3::payload{42},
+ *                      "message",
+ *                      nvtx3::category{1}};
+ *
+ * // Arguments can be in any order
+ * event_attributes attr{nvtx3::payload{42},
+ *                      nvtx3::category{1},
+ *                      "message",
+ *                      nvtx3::rgb{127, 255, 0}};
+ *
+ * // "First wins" with multiple arguments of the same type
+ * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload is
+ * 42 \endcode
+ *
+ * \subsection MESSAGES message
+ *
+ * A `nvtx3::message` allows associating a custom message string with an NVTX
+ * event.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Create an `event_attributes` with the custom message "my message"
+ * nvtx3::event_attributes attr{nvtx3::Mesage{"my message"}};
+ *
+ * // strings and string literals implicitly assumed to be a `nvtx3::message`
+ * nvtx3::event_attributes attr{"my message"};
+ * \endcode
+ *
+ * \subsubsection REGISTERED_MESSAGE Registered Messages
+ *
+ * Associating a `nvtx3::message` with an event requires copying the contents of
+ * the message every time the message is used, i.e., copying the entire message
+ * string. This may cause non-trivial overhead in performance sensitive code.
+ *
+ * To eliminate this overhead, NVTX allows registering a message string,
+ * yielding a "handle" that is inexpensive to copy that may be used in place of
+ * a message string. When visualizing the events, tools such as Nsight Systems
+ * will take care of mapping the message handle to its string.
+ *
+ * A message should be registered once and the handle reused throughout the rest
+ * of the application. This can be done by either explicitly creating static
+ * `nvtx3::registered_string` objects, or using the
+ * `nvtx3::registered_string::get` construct on first use helper (recommended).
+ *
+ * Similar to \ref DOMAINS, `nvtx3::registered_string::get` requires defining a
+ * custom tag type with a static `message` member whose value will be the
+ * contents of the registered string.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Explicitly constructed, static `registered_string`
+ * static registered_string<my_domain> static_message{"my message"};
+ *
+ * // Or use construct on first use:
+ * // Define a tag type with a `message` member string to register
+ * struct my_message{ static constexpr char const* message{ "my message" }; };
+ *
+ * // Uses construct on first use to register the contents of
+ * // `my_message::message`
+ * nvtx3::registered_string<my_domain> const& msg =
+ * nvtx3::registered_string<my_domain>::get<my_message>(); \endcode
+ *
+ * \subsection COLOR color
+ *
+ * Associating a `nvtx3::color` with an event allows controlling how the event
+ * is visualized in a tool such as Nsight Systems. This is a convenient way to
+ * visually differentiate among different events.
+ *
+ * \code{.cpp}
+ * // Define a color via rgb color values
+ * nvtx3::color c{nvtx3::rgb{127, 255, 0}};
+ * nvtx3::event_attributes attr{c};
+ *
+ * // rgb color values can be passed directly to an `event_attributes`
+ * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}};
+ * \endcode
+ *
+ * \subsection CATEGORY category
+ *
+ * A `nvtx3::category` is simply an integer id that allows for fine-grain
+ * grouping of NVTX events. For example, one might use separate categories for
+ * IO, memory allocation, compute, etc.
+ *
+ * \code{.cpp}
+ * nvtx3::event_attributes{nvtx3::category{1}};
+ * \endcode
+ *
+ * \subsubsection NAMED_CATEGORIES Named Categories
+ *
+ * Associates a `name` string with a category `id` to help differentiate among
+ * categories.
+ *
+ * For any given category id `Id`, a `named_category{Id, "name"}` should only
+ * be constructed once and reused throughout an application. This can be done by
+ * either explicitly creating static `nvtx3::named_category` objects, or using
+ * the `nvtx3::named_category::get` construct on first use helper (recommended).
+ *
+ * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a
+ * custom tag type with static `name` and `id` members.
+ *
+ * \code{.cpp}
+ * // Explicitly constructed, static `named_category`
+ * static nvtx3::named_category static_category{42, "my category"};
+ *
+ * // OR use construct on first use:
+ * // Define a tag type with `name` and `id` members
+ * struct my_category{
+ *    static constexpr char const* name{"my category"}; // category name
+ *    static constexpr category::id_type id{42}; // category id
+ * };
+ *
+ * // Use construct on first use to name the category id `42`
+ * // with name "my category"
+ * nvtx3::named_category const& my_category =
+ * named_category<my_domain>::get<my_category>();
+ *
+ * // Range `r` associated with category id `42`
+ * nvtx3::event_attributes attr{my_category};
+ * \endcode
+ *
+ * \subsection PAYLOAD payload
+ *
+ * Allows associating a user-defined numerical value with an event.
+ *
+ * ```
+ * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload
+ * from
+ *                                                 // the `int32_t` value 42
+ * ```
+ *
+ *
+ * \section EXAMPLE Example
+ *
+ * Putting it all together:
+ * \code{.cpp}
+ * // Define a custom domain tag type
+ * struct my_domain{ static constexpr char const* name{"my domain"}; };
+ *
+ * // Define a named category tag type
+ * struct my_category{
+ *    static constexpr char const* name{"my category"};
+ *    static constexpr uint32_t id{42};
+ * };
+ *
+ * // Define a registered string tag type
+ * struct my_message{ static constexpr char const* message{"my message"}; };
+ *
+ * // For convenience, use aliases for domain scoped objects
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ * using my_registered_string = nvtx3::registered_string<my_domain>;
+ * using my_named_category = nvtx3::named_category<my_domain>;
+ *
+ * // Default values for all attributes
+ * nvtx3::event_attributes attr{};
+ * my_thread_range r0{attr};
+ *
+ * // Custom (unregistered) message, and unnamed category
+ * nvtx3::event_attributes attr1{"message", nvtx3::category{2}};
+ * my_thread_range r1{attr1};
+ *
+ * // Alternatively, pass arguments of `event_attributes` ctor directly to
+ * // `my_thread_range`
+ * my_thread_range r2{"message", nvtx3::category{2}};
+ *
+ * // construct on first use a registered string
+ * auto msg = my_registered_string::get<my_message>();
+ *
+ * // construct on first use a named category
+ * auto category = my_named_category::get<my_category>();
+ *
+ * // Use registered string and named category
+ * my_thread_range r3{msg, category, nvtx3::rgb{127, 255, 0},
+ *                    nvtx3::payload{42}};
+ *
+ * // Any number of arguments in any order
+ * my_thread_range r{nvtx3::rgb{127, 255,0}, msg};
+ *
+ * \endcode
+ * \section MACROS Convenience Macros
+ *
+ * Oftentimes users want to quickly and easily add NVTX ranges to their library
+ * or application to aid in profiling and optimization.
+ *
+ * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and
+ * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an
+ * `nvtx3::domain_thread_range` with the name of the enclosing function as the
+ * range's message.
+ *
+ * \code{.cpp}
+ * void some_function(){
+ *    // Automatically generates an NVTX range for the duration of the function
+ *    // using "some_function" as the event's message.
+ *    NVTX3_FUNC_RANGE();
+ * }
+ * \endcode
+ *
+ */
+
+/* Temporary helper #defines, removed with #undef at end of header */
+
+/* Within this header, nvtx3::NVTX3_VERSION_NAMESPACE resolves to nvtx3::vX,
+ * where "X" is the major version number. */
+#define NVTX3_CONCAT(A, B) A##B
+#define NVTX3_NAMESPACE_FOR(VERSION) NVTX3_CONCAT(v, VERSION)
+#define NVTX3_VERSION_NAMESPACE NVTX3_NAMESPACE_FOR(NVTX3_CPP_VERSION_MAJOR)
+
+/* Avoid duplicating #if defined(NVTX3_INLINE_THIS_VERSION) for namespaces
+ * in each minor version by making a macro to use unconditionally, which
+ * resolves to "inline" or nothing as appropriate. */
+#if defined(NVTX3_INLINE_THIS_VERSION)
+#define NVTX3_INLINE_IF_REQUESTED inline
+#else
+#define NVTX3_INLINE_IF_REQUESTED
+#endif
+
+/* Enables the use of constexpr when support for C++14 relaxed constexpr
+ * is present.
+ *
+ * Initializing a legacy-C (i.e., no constructor) union member requires
+ * initializing in the constructor body. Non-empty constexpr constructors
+ * require C++14 relaxed constexpr.  In strict C++11 compilation, fall back
+ * to using non-constexpr constructors for classes with union members.
+ */
+#if __cpp_constexpr >= 201304L
+#define NVTX3_RELAXED_CONSTEXPR constexpr
+#else
+#define NVTX3_RELAXED_CONSTEXPR
+#endif
+
+/* Implementation sections, enclosed in guard macros for each minor version */
+
+#ifndef NVTX3_CPP_DEFINITIONS_V1_0
+#define NVTX3_CPP_DEFINITIONS_V1_0
+
+namespace nvtx3 {
+
+NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE
+{
+
+namespace detail {
+
+/**
+ * @brief Verifies if a type `T` contains a member `T::name` of type `const
+ * char*` or `const wchar_t*`.
+ *
+ * @tparam T The type to verify
+ * @return True if `T` contains a member `T::name` of type `const char*` or
+ * `const wchar_t*`.
+ */
+template <typename T>
+constexpr auto has_name_member() noexcept -> decltype(T::name, bool())
+{
+  return (std::is_same<char const*, typename std::decay<decltype(T::name)>::type>::value ||
+          std::is_same<wchar_t const*, typename std::decay<decltype(T::name)>::type>::value);
+}
+}  // namespace detail
+
+/**
+ * @brief `domain`s allow for grouping NVTX events into a single scope to
+ * differentiate them from events in other `domain`s.
+ *
+ * By default, all NVTX constructs are placed in the "global" NVTX domain.
+ *
+ * A custom `domain` may be used in order to differentiate a library's or
+ * application's NVTX events from other events.
+ *
+ * `domain`s are expected to be long-lived and unique to a library or
+ * application. As such, it is assumed a domain's name is known at compile
+ * time. Therefore, all NVTX constructs that can be associated with a domain
+ * require the domain to be specified via a *type* `DomainName` passed as an
+ * explicit template parameter.
+ *
+ * The type `domain::global` may be used to indicate that the global NVTX
+ * domain should be used.
+ *
+ * None of the C++ NVTX constructs require the user to manually construct a
+ * `domain` object. Instead, if a custom domain is desired, the user is
+ * expected to define a type `DomainName` that contains a member
+ * `DomainName::name` which resolves to either a `char const*` or `wchar_t
+ * const*`. The value of `DomainName::name` is used to name and uniquely
+ * identify the custom domain.
+ *
+ * Upon the first use of an NVTX construct associated with the type
+ * `DomainName`, the "construct on first use" pattern is used to construct a
+ * function local static `domain` object. All future NVTX constructs
+ * associated with `DomainType` will use a reference to the previously
+ * constructed `domain` object. See `domain::get`.
+ *
+ * Example:
+ * ```
+ * // The type `my_domain` defines a `name` member used to name and identify
+ * the
+ * // `domain` object identified by `my_domain`.
+ * struct my_domain{ static constexpr char const* name{"my_domain"}; };
+ *
+ * // The NVTX range `r` will be grouped with all other NVTX constructs
+ * // associated with  `my_domain`.
+ * nvtx3::domain_thread_range<my_domain> r{};
+ *
+ * // An alias can be created for a `domain_thread_range` in the custom domain
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ * my_thread_range my_range{};
+ *
+ * // `domain::global` indicates that the global NVTX domain is used
+ * nvtx3::domain_thread_range<domain::global> r2{};
+ *
+ * // For convenience, `nvtx3::thread_range` is an alias for a range in the
+ * // global domain
+ * nvtx3::thread_range r3{};
+ * ```
+ */
+class domain {
+ public:
+  domain(domain const&) = delete;
+  domain& operator=(domain const&) = delete;
+  domain(domain&&)                 = delete;
+  domain& operator=(domain&&) = delete;
+
+  /**
+   * @brief Returns reference to an instance of a function local static
+   * `domain` object.
+   *
+   * Uses the "construct on first use" idiom to safely ensure the `domain`
+   * object is initialized exactly once upon first invocation of
+   * `domain::get<DomainName>()`. All following invocations will return a
+   * reference to the previously constructed `domain` object. See
+   * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use
+   *
+   * None of the constructs in this header require the user to directly invoke
+   * `domain::get`. It is automatically invoked when constructing objects like
+   * a `domain_thread_range` or `category`. Advanced users may wish to use
+   * `domain::get` for the convenience of the "construct on first use" idiom
+   * when using domains with their own use of the NVTX C API.
+   *
+   * This function is threadsafe as of C++11. If two or more threads call
+   * `domain::get<DomainName>` concurrently, exactly one of them is guaranteed
+   * to construct the `domain` object and the other(s) will receive a
+   * reference to the object after it is fully constructed.
+   *
+   * The domain's name is specified via the type `DomainName` pass as an
+   * explicit template parameter. `DomainName` is required to contain a
+   * member `DomainName::name` that resolves to either a `char const*` or
+   * `wchar_t const*`. The value of `DomainName::name` is used to name and
+   * uniquely identify the `domain`.
+   *
+   * Example:
+   * ```
+   * // The type `my_domain` defines a `name` member used to name and identify
+   * // the `domain` object identified by `my_domain`.
+   * struct my_domain{ static constexpr char const* name{"my domain"}; };
+   *
+   * auto D = domain::get<my_domain>(); // First invocation constructs a
+   *                                    // `domain` with the name "my domain"
+   *
+   * auto D1 = domain::get<my_domain>(); // Simply returns reference to
+   *                                     // previously constructed `domain`.
+   * ```
+   *
+   * @tparam DomainName Type that contains a `DomainName::name` member used to
+   * name the `domain` object.
+   * @return Reference to the `domain` corresponding to the type `DomainName`.
+   */
+  template <typename DomainName>
+  static domain const& get()
+  {
+    static_assert(detail::has_name_member<DomainName>(),
+                  "Type used to identify a domain must contain a name member of"
+                  "type const char* or const wchar_t*");
+    static domain const d{DomainName::name};
+    return d;
+  }
+
+  /**
+   * @brief Conversion operator to `nvtxDomainHandle_t`.
+   *
+   * Allows transparently passing a domain object into an API expecting a
+   * native `nvtxDomainHandle_t` object.
+   */
+  operator nvtxDomainHandle_t() const noexcept { return _domain; }
+
+  /**
+   * @brief Tag type for the "global" NVTX domain.
+   *
+   * This type may be passed as a template argument to any function/class
+   * expecting a type to identify a domain to indicate that the global domain
+   * should be used.
+   *
+   * All NVTX events in the global domain across all libraries and
+   * applications will be grouped together.
+   *
+   */
+  struct global {
+  };
+
+ private:
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {}
+
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {}
+
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(std::string const& name) noexcept : domain{name.c_str()} {}
+
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {}
+
+  /**
+   * @brief Default constructor creates a `domain` representing the
+   * "global" NVTX domain.
+   *
+   * All events not associated with a custom `domain` are grouped in the
+   * "global" NVTX domain.
+   *
+   */
+  domain() = default;
+
+  /**
+   * @brief Intentionally avoid calling nvtxDomainDestroy on the `domain` object.
+   *
+   * No currently-available tools attempt to free domain resources when the
+   * nvtxDomainDestroy function is called, due to the thread-safety and
+   * efficiency challenges of freeing thread-local storage for other threads.
+   * Since libraries may be disallowed from introducing static destructors,
+   * and destroying the domain is likely to have no effect, the destructor
+   * for `domain` intentionally chooses to not destroy the domain.
+   *
+   * In a situation where domain destruction is necessary, either manually
+   * call nvtxDomainDestroy on the domain's handle, or make a class that
+   * derives from `domain` and calls nvtxDomainDestroy in its destructor.
+   */
+  ~domain() = default;
+
+ private:
+  nvtxDomainHandle_t const _domain{};  ///< The `domain`s NVTX handle
+};
+
+/**
+ * @brief Returns reference to the `domain` object that represents the global
+ * NVTX domain.
+ *
+ * This specialization for `domain::global` returns a default constructed,
+ * `domain` object for use when the "global" domain is desired.
+ *
+ * All NVTX events in the global domain across all libraries and applications
+ * will be grouped together.
+ *
+ * @return Reference to the `domain` corresponding to the global NVTX domain.
+ *
+ */
+template <>
+inline domain const& domain::get<domain::global>()
+{
+  static domain const d{};
+  return d;
+}
+
+/**
+ * @brief Indicates the values of the red, green, blue color channels for
+ * a rgb color code.
+ *
+ */
+struct rgb {
+  /// Type used for component values
+  using component_type = uint8_t;
+
+  /**
+   * @brief Construct a rgb with red, green, and blue channels
+   * specified by `red_`, `green_`, and `blue_`, respectively.
+   *
+   * Valid values are in the range `[0,255]`.
+   *
+   * @param red_ Value of the red channel
+   * @param green_ Value of the green channel
+   * @param blue_ Value of the blue channel
+   */
+  constexpr rgb(component_type red_, component_type green_, component_type blue_) noexcept
+    : red{red_}, green{green_}, blue{blue_}
+  {
+  }
+
+  component_type const red{};    ///< Red channel value
+  component_type const green{};  ///< Green channel value
+  component_type const blue{};   ///< Blue channel value
+};
+
+/**
+ * @brief Indicates the value of the alpha, red, green, and blue color
+ * channels for an argb color code.
+ *
+ */
+struct argb final : rgb {
+  /**
+   * @brief Construct an argb with alpha, red, green, and blue channels
+   * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively.
+   *
+   * Valid values are in the range `[0,255]`.
+   *
+   * @param alpha_  Value of the alpha channel (opacity)
+   * @param red_  Value of the red channel
+   * @param green_  Value of the green channel
+   * @param blue_  Value of the blue channel
+   *
+   */
+  constexpr argb(component_type alpha_,
+                 component_type red_,
+                 component_type green_,
+                 component_type blue_) noexcept
+    : rgb{red_, green_, blue_}, alpha{alpha_}
+  {
+  }
+
+  component_type const alpha{};  ///< Alpha channel value
+};
+
+/**
+ * @brief Represents a custom color that can be associated with an NVTX event
+ * via it's `event_attributes`.
+ *
+ * Specifying colors for NVTX events is a convenient way to visually
+ * differentiate among different events in a visualization tool such as Nsight
+ * Systems.
+ *
+ */
+class color {
+ public:
+  /// Type used for the color's value
+  using value_type = uint32_t;
+
+  /**
+   * @brief Constructs a `color` using the value provided by `hex_code`.
+   *
+   * `hex_code` is expected to be a 4 byte argb hex code.
+   *
+   * The most significant byte indicates the value of the alpha channel
+   * (opacity) (0-255)
+   *
+   * The next byte indicates the value of the red channel (0-255)
+   *
+   * The next byte indicates the value of the green channel (0-255)
+   *
+   * The least significant byte indicates the value of the blue channel
+   * (0-255)
+   *
+   * @param hex_code The hex code used to construct the `color`
+   */
+  constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {}
+
+  /**
+   * @brief Construct a `color` using the alpha, red, green, blue components
+   * in `argb`.
+   *
+   * @param argb The alpha, red, green, blue components of the desired `color`
+   */
+  constexpr color(argb argb) noexcept
+    : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green, argb.blue)}
+  {
+  }
+
+  /**
+   * @brief Construct a `color` using the red, green, blue components in
+   * `rgb`.
+   *
+   * Uses maximum value for the alpha channel (opacity) of the `color`.
+   *
+   * @param rgb The red, green, blue components of the desired `color`
+   */
+  constexpr color(rgb rgb) noexcept
+    : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)}
+  {
+  }
+
+  /**
+   * @brief Returns the `color`s argb hex code
+   *
+   */
+  constexpr value_type get_value() const noexcept { return _value; }
+
+  /**
+   * @brief Return the NVTX color type of the color.
+   *
+   */
+  constexpr nvtxColorType_t get_type() const noexcept { return _type; }
+
+  color()             = delete;
+  ~color()            = default;
+  color(color const&) = default;
+  color& operator=(color const&) = default;
+  color(color&&)                 = default;
+  color& operator=(color&&) = default;
+
+ private:
+  /**
+   * @brief Constructs an unsigned, 4B integer from the component bytes in
+   * most to least significant byte order.
+   *
+   */
+  constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3,
+                                                    uint8_t byte2,
+                                                    uint8_t byte1,
+                                                    uint8_t byte0) noexcept
+  {
+    return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0};
+  }
+
+  value_type const _value{};                     ///< color's argb color code
+  nvtxColorType_t const _type{NVTX_COLOR_ARGB};  ///< NVTX color type code
+};
+
+/**
+ * @brief Object for intra-domain grouping of NVTX events.
+ *
+ * A `category` is simply an integer id that allows for fine-grain grouping of
+ * NVTX events. For example, one might use separate categories for IO, memory
+ * allocation, compute, etc.
+ *
+ * Example:
+ * \code{.cpp}
+ * nvtx3::category cat1{1};
+ *
+ * // Range `r1` belongs to the category identified by the value `1`.
+ * nvtx3::thread_range r1{cat1};
+ *
+ * // Range `r2` belongs to the same category as `r1`
+ * nvtx3::thread_range r2{nvtx3::category{1}};
+ * \endcode
+ *
+ * To associate a name string with a category id, see `named_category`.
+ *
+ */
+class category {
+ public:
+  /// Type used for `category`s integer id.
+  using id_type = uint32_t;
+
+  /**
+   * @brief Construct a `category` with the specified `id`.
+   *
+   * The `category` will be unnamed and identified only by its `id` value.
+   *
+   * All `category` objects sharing the same `id` are equivalent.
+   *
+   * @param[in] id The `category`'s identifying value
+   */
+  constexpr explicit category(id_type id) noexcept : id_{id} {}
+
+  /**
+   * @brief Returns the id of the category.
+   *
+   */
+  constexpr id_type get_id() const noexcept { return id_; }
+
+  category()                = delete;
+  ~category()               = default;
+  category(category const&) = default;
+  category& operator=(category const&) = default;
+  category(category&&)                 = default;
+  category& operator=(category&&) = default;
+
+ private:
+  id_type const id_{};  ///< category's unique identifier
+};
+
+/**
+ * @brief A `category` with an associated name string.
+ *
+ * Associates a `name` string with a category `id` to help differentiate among
+ * categories.
+ *
+ * For any given category id `Id`, a `named_category(Id, "name")` should only
+ * be constructed once and reused throughout an application. This can be done
+ * by either explicitly creating static `named_category` objects, or using the
+ * `named_category::get` construct on first use helper (recommended).
+ *
+ * Creating two or more `named_category` objects with the same value for `id`
+ * in the same domain results in undefined behavior.
+ *
+ * Similarly, behavior is undefined when a `named_category` and `category`
+ * share the same value of `id`.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Explicitly constructed, static `named_category`
+ * static nvtx3::named_category static_category{42, "my category"};
+ *
+ * // Range `r` associated with category id `42`
+ * nvtx3::thread_range r{static_category};
+ *
+ * // OR use construct on first use:
+ *
+ * // Define a type with `name` and `id` members
+ * struct my_category{
+ *    static constexpr char const* name{"my category"}; // category name
+ *    static constexpr category::id_type id{42}; // category id
+ * };
+ *
+ * // Use construct on first use to name the category id `42`
+ * // with name "my category"
+ * auto my_category = named_category<my_domain>::get<my_category>();
+ *
+ * // Range `r` associated with category id `42`
+ * nvtx3::thread_range r{my_category};
+ * \endcode
+ *
+ * `named_category`'s association of a name to a category id is local to the
+ * domain specified by the type `D`. An id may have a different name in
+ * another domain.
+ *
+ * @tparam D Type containing `name` member used to identify the `domain` to
+ * which the `named_category` belongs. Else, `domain::global` to  indicate
+ * that the global NVTX domain should be used.
+ */
+template <typename D = domain::global>
+class named_category final : public category {
+ public:
+  /**
+   * @brief Returns a global instance of a `named_category` as a
+   * function-local static.
+   *
+   * Creates a `named_category` with name and id specified by the contents of
+   * a type `C`. `C::name` determines the name and `C::id` determines the
+   * category id.
+   *
+   * This function is useful for constructing a named `category` exactly once
+   * and reusing the same instance throughout an application.
+   *
+   * Example:
+   * \code{.cpp}
+   * // Define a type with `name` and `id` members
+   * struct my_category{
+   *    static constexpr char const* name{"my category"}; // category name
+   *    static constexpr uint32_t id{42}; // category id
+   * };
+   *
+   * // Use construct on first use to name the category id `42`
+   * // with name "my category"
+   * auto cat = named_category<my_domain>::get<my_category>();
+   *
+   * // Range `r` associated with category id `42`
+   * nvtx3::thread_range r{cat};
+   * \endcode
+   *
+   * Uses the "construct on first use" idiom to safely ensure the `category`
+   * object is initialized exactly once. See
+   * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use
+   *
+   * @tparam C Type containing a member `C::name` that resolves  to either a
+   * `char const*` or `wchar_t const*` and `C::id`.
+   */
+  template <typename C>
+  static named_category<D> const& get() noexcept
+  {
+    static_assert(detail::has_name_member<C>(),
+                  "Type used to name a category must contain a name member.");
+    static named_category<D> const category{C::id, C::name};
+    return category;
+  }
+  /**
+   * @brief Construct a `category` with the specified `id` and `name`.
+   *
+   * The name `name` will be registered with `id`.
+   *
+   * Every unique value of `id` should only be named once.
+   *
+   * @param[in] id The category id to name
+   * @param[in] name The name to associated with `id`
+   */
+  named_category(id_type id, char const* name) noexcept : category{id}
+  {
+#ifndef NVTX_DISABLE
+    nvtxDomainNameCategoryA(domain::get<D>(), get_id(), name);
+#else
+    (void)id;
+    (void)name;
+#endif
+  };
+
+  /**
+   * @brief Construct a `category` with the specified `id` and `name`.
+   *
+   * The name `name` will be registered with `id`.
+   *
+   * Every unique value of `id` should only be named once.
+   *
+   * @param[in] id The category id to name
+   * @param[in] name The name to associated with `id`
+   */
+  named_category(id_type id, wchar_t const* name) noexcept : category{id}
+  {
+#ifndef NVTX_DISABLE
+    nvtxDomainNameCategoryW(domain::get<D>(), get_id(), name);
+#else
+    (void)id;
+    (void)name;
+#endif
+  };
+};
+
+/**
+ * @brief A message registered with NVTX.
+ *
+ * Normally, associating a `message` with an NVTX event requires copying the
+ * contents of the message string. This may cause non-trivial overhead in
+ * highly performance sensitive regions of code.
+ *
+ * message registration is an optimization to lower the overhead of
+ * associating a message with an NVTX event. Registering a message yields a
+ * handle that is inexpensive to copy that may be used in place of a message
+ * string.
+ *
+ * A particular message should only be registered once and the handle
+ * reused throughout the rest of the application. This can be done by either
+ * explicitly creating static `registered_string` objects, or using the
+ * `registered_string::get` construct on first use helper (recommended).
+ *
+ * Example:
+ * \code{.cpp}
+ * // Explicitly constructed, static `registered_string`
+ * static registered_string<my_domain> static_message{"message"};
+ *
+ * // "message" is associated with the range `r`
+ * nvtx3::thread_range r{static_message};
+ *
+ * // Or use construct on first use:
+ *
+ * // Define a type with a `message` member that defines the contents of the
+ * // registered string
+ * struct my_message{ static constexpr char const* message{ "my message" }; };
+ *
+ * // Uses construct on first use to register the contents of
+ * // `my_message::message`
+ * auto msg = registered_string<my_domain>::get<my_message>();
+ *
+ * // "my message" is associated with the range `r`
+ * nvtx3::thread_range r{msg};
+ * \endcode
+ *
+ * `registered_string`s are local to a particular domain specified via
+ * the type `D`.
+ *
+ * @tparam D Type containing `name` member used to identify the `domain` to
+ * which the `registered_string` belongs. Else, `domain::global` to  indicate
+ * that the global NVTX domain should be used.
+ */
+template <typename D = domain::global>
+class registered_string {
+ public:
+  /**
+   * @brief Returns a global instance of a `registered_string` as a function
+   * local static.
+   *
+   * Provides a convenient way to register a message with NVTX without having
+   * to explicitly register the message.
+   *
+   * Upon first invocation, constructs a `registered_string` whose contents
+   * are specified by `message::message`.
+   *
+   * All future invocations will return a reference to the object constructed
+   * in the first invocation.
+   *
+   * Example:
+   * \code{.cpp}
+   * // Define a type with a `message` member that defines the contents of the
+   * // registered string
+   * struct my_message{ static constexpr char const* message{ "my message" };
+   * };
+   *
+   * // Uses construct on first use to register the contents of
+   * // `my_message::message`
+   * auto msg = registered_string<my_domain>::get<my_message>();
+   *
+   * // "my message" is associated with the range `r`
+   * nvtx3::thread_range r{msg};
+   * \endcode
+   *
+   * @tparam M Type required to contain a member `M::message` that
+   * resolves to either a `char const*` or `wchar_t const*` used as the
+   * registered string's contents.
+   * @return Reference to a `registered_string` associated with the type `M`.
+   */
+  template <typename M>
+  static registered_string<D> const& get() noexcept
+  {
+    static registered_string<D> const registered_string{M::message};
+    return registered_string;
+  }
+
+  /**
+   * @brief Constructs a `registered_string` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_string(char const* msg) noexcept
+    : handle_{nvtxDomainRegisterStringA(domain::get<D>(), msg)}
+  {
+  }
+
+  /**
+   * @brief Constructs a `registered_string` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_string(std::string const& msg) noexcept : registered_string{msg.c_str()} {}
+
+  /**
+   * @brief Constructs a `registered_string` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_string(wchar_t const* msg) noexcept
+    : handle_{nvtxDomainRegisterStringW(domain::get<D>(), msg)}
+  {
+  }
+
+  /**
+   * @brief Constructs a `registered_string` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_string(std::wstring const& msg) noexcept : registered_string{msg.c_str()} {}
+
+  /**
+   * @brief Returns the registered string's handle
+   *
+   */
+  nvtxStringHandle_t get_handle() const noexcept { return handle_; }
+
+  registered_string()                         = delete;
+  ~registered_string()                        = default;
+  registered_string(registered_string const&) = default;
+  registered_string& operator=(registered_string const&) = default;
+  registered_string(registered_string&&)                 = default;
+  registered_string& operator=(registered_string&&) = default;
+
+ private:
+  nvtxStringHandle_t const handle_{};  ///< The handle returned from
+                                       ///< registering the message with NVTX
+};
+
+/**
+ * @brief Allows associating a message string with an NVTX event via
+ * its `EventAttribute`s.
+ *
+ * Associating a `message` with an NVTX event through its `event_attributes`
+ * allows for naming events to easily differentiate them from other events.
+ *
+ * Every time an NVTX event is created with an associated `message`, the
+ * contents of the message string must be copied.  This may cause non-trivial
+ * overhead in highly performance sensitive sections of code. Use of a
+ * `nvtx3::registered_string` is recommended in these situations.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Creates an `event_attributes` with message "message 0"
+ * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}};
+ *
+ * // `range0` contains message "message 0"
+ * nvtx3::thread_range range0{attr0};
+ *
+ * // `std::string` and string literals are implicitly assumed to be
+ * // the contents of an `nvtx3::message`
+ * // Creates an `event_attributes` with message "message 1"
+ * nvtx3::event_attributes attr1{"message 1"};
+ *
+ * // `range1` contains message "message 1"
+ * nvtx3::thread_range range1{attr1};
+ *
+ * // `range2` contains message "message 2"
+ * nvtx3::thread_range range2{nvtx3::Mesage{"message 2"}};
+ *
+ * // `std::string` and string literals are implicitly assumed to be
+ * // the contents of an `nvtx3::message`
+ * // `range3` contains message "message 3"
+ * nvtx3::thread_range range3{"message 3"};
+ * \endcode
+ */
+class message {
+ public:
+  using value_type = nvtxMessageValue_t;
+
+  /**
+   * @brief Construct a `message` whose contents are specified by `msg`.
+   *
+   * @param msg The contents of the message
+   */
+  NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII}
+  {
+    value_.ascii = msg;
+  }
+
+  /**
+   * @brief Construct a `message` whose contents are specified by `msg`.
+   *
+   * @param msg The contents of the message
+   */
+  message(std::string const& msg) noexcept : message{msg.c_str()} {}
+
+  /**
+   * @brief Disallow construction for `std::string` r-value
+   *
+   * `message` is a non-owning type and therefore cannot take ownership of an
+   * r-value. Therefore, constructing from an r-value is disallowed to prevent
+   * a dangling pointer.
+   *
+   */
+  message(std::string&&) = delete;
+
+  /**
+   * @brief Construct a `message` whose contents are specified by `msg`.
+   *
+   * @param msg The contents of the message
+   */
+  NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE}
+  {
+    value_.unicode = msg;
+  }
+
+  /**
+   * @brief Construct a `message` whose contents are specified by `msg`.
+   *
+   * @param msg The contents of the message
+   */
+  message(std::wstring const& msg) noexcept : message{msg.c_str()} {}
+
+  /**
+   * @brief Disallow construction for `std::wstring` r-value
+   *
+   * `message` is a non-owning type and therefore cannot take ownership of an
+   * r-value. Therefore, constructing from an r-value is disallowed to prevent
+   * a dangling pointer.
+   *
+   */
+  message(std::wstring&&) = delete;
+
+  /**
+   * @brief Construct a `message` from a `registered_string`.
+   *
+   * @tparam D Type containing `name` member used to identify the `domain`
+   * to which the `registered_string` belongs. Else, `domain::global` to
+   * indicate that the global NVTX domain should be used.
+   * @param msg The message that has already been registered with NVTX.
+   */
+  template <typename D>
+  NVTX3_RELAXED_CONSTEXPR message(registered_string<D> const& msg) noexcept
+    : type_{NVTX_MESSAGE_TYPE_REGISTERED}
+  {
+    value_.registered = msg.get_handle();
+  }
+
+  /**
+   * @brief Return the union holding the value of the message.
+   *
+   */
+  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; }
+
+  /**
+   * @brief Return the type information about the value the union holds.
+   *
+   */
+  NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept { return type_; }
+
+ private:
+  nvtxMessageType_t const type_{};  ///< message type
+  nvtxMessageValue_t value_{};      ///< message contents
+};
+
+/**
+ * @brief A numerical value that can be associated with an NVTX event via
+ * its `event_attributes`.
+ *
+ * Example:
+ * ```
+ * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload
+ * from
+ *                                                 // the `int32_t` value 42
+ *
+ * // `range0` will have an int32_t payload of 42
+ * nvtx3::thread_range range0{attr};
+ *
+ * // range1 has double payload of 3.14
+ * nvtx3::thread_range range1{ nvtx3::payload{3.14} };
+ * ```
+ */
+class payload {
+ public:
+  using value_type = typename nvtxEventAttributes_v2::payload_t;
+
+  /**
+   * @brief Construct a `payload` from a signed, 8 byte integer.
+   *
+   * @param value Value to use as contents of the payload
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(int64_t value) noexcept
+    : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{}
+  {
+    value_.llValue = value;
+  }
+
+  /**
+   * @brief Construct a `payload` from a signed, 4 byte integer.
+   *
+   * @param value Value to use as contents of the payload
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(int32_t value) noexcept
+    : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{}
+  {
+    value_.iValue = value;
+  }
+
+  /**
+   * @brief Construct a `payload` from an unsigned, 8 byte integer.
+   *
+   * @param value Value to use as contents of the payload
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(uint64_t value) noexcept
+    : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{}
+  {
+    value_.ullValue = value;
+  }
+
+  /**
+   * @brief Construct a `payload` from an unsigned, 4 byte integer.
+   *
+   * @param value Value to use as contents of the payload
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(uint32_t value) noexcept
+    : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{}
+  {
+    value_.uiValue = value;
+  }
+
+  /**
+   * @brief Construct a `payload` from a single-precision floating point
+   * value.
+   *
+   * @param value Value to use as contents of the payload
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(float value) noexcept
+    : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{}
+  {
+    value_.fValue = value;
+  }
+
+  /**
+   * @brief Construct a `payload` from a double-precision floating point
+   * value.
+   *
+   * @param value Value to use as contents of the payload
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(double value) noexcept
+    : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{}
+  {
+    value_.dValue = value;
+  }
+
+  /**
+   * @brief Return the union holding the value of the payload
+   *
+   */
+  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; }
+
+  /**
+   * @brief Return the information about the type the union holds.
+   *
+   */
+  NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept { return type_; }
+
+ private:
+  nvtxPayloadType_t const type_;  ///< Type of the payload value
+  value_type value_;              ///< Union holding the payload value
+};
+
+/**
+ * @brief Describes the attributes of a NVTX event.
+ *
+ * NVTX events can be customized via four "attributes":
+ *
+ * - color:    color used to visualize the event in tools such as Nsight
+ *             Systems. See `color`.
+ * - message:  Custom message string. See `message`.
+ * - payload:  User-defined numerical value. See `payload`.
+ * - category: Intra-domain grouping. See `category`.
+ *
+ * These component attributes are specified via an `event_attributes` object.
+ * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and
+ * `nvtx3::category` for how these individual attributes are constructed.
+ *
+ * While it is possible to specify all four attributes, it is common to want
+ * to only specify a subset of attributes and use default values for the
+ * others. For convenience, `event_attributes` can be constructed from any
+ * number of attribute components in any order.
+ *
+ * Example:
+ * \code{.cpp}
+ * event_attributes attr{}; // No arguments, use defaults for all attributes
+ *
+ * event_attributes attr{"message"}; // Custom message, rest defaulted
+ *
+ * // Custom color & message
+ * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}};
+ *
+ * /// Custom color & message, can use any order of arguments
+ * event_attributes attr{nvtx3::rgb{127, 255, 0}, "message"};
+ *
+ *
+ * // Custom color, message, payload, category
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                      "message",
+ *                      nvtx3::payload{42},
+ *                      nvtx3::category{1}};
+ *
+ * // Custom color, message, payload, category, can use any order of arguments
+ * event_attributes attr{nvtx3::payload{42},
+ *                      nvtx3::category{1},
+ *                      "message",
+ *                      nvtx3::rgb{127, 255, 0}};
+ *
+ * // Multiple arguments of the same type are allowed, but only the first is
+ * // used. All others are ignored
+ * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload
+ * is 42
+ *
+ * // Range `r` will be customized according the attributes in `attr`
+ * nvtx3::thread_range r{attr};
+ *
+ * // For convenience, the arguments that can be passed to the
+ * `event_attributes`
+ * // constructor may be passed to the `domain_thread_range` contructor where
+ * // they will be forwarded to the `EventAttribute`s constructor
+ * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"};
+ * \endcode
+ *
+ */
+class event_attributes {
+ public:
+  using value_type = nvtxEventAttributes_t;
+
+  /**
+   * @brief Default constructor creates an `event_attributes` with no
+   * category, color, payload, nor message.
+   */
+  constexpr event_attributes() noexcept
+    : attributes_{
+        NVTX_VERSION,                   // version
+        sizeof(nvtxEventAttributes_t),  // size
+        0,                              // category
+        NVTX_COLOR_UNKNOWN,             // color type
+        0,                              // color value
+        NVTX_PAYLOAD_UNKNOWN,           // payload type
+        0,                              // reserved 4B
+        0,                              // payload value (union)
+        NVTX_MESSAGE_UNKNOWN,           // message type
+        0                               // message value (union)
+      }
+  {
+  }
+
+  /**
+   * @brief Variadic constructor where the first argument is a `category`.
+   *
+   * Sets the value of the `EventAttribute`s category based on `c` and
+   * forwards the remaining variadic parameter pack to the next constructor.
+   *
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(category const& c, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.category = c.get_id();
+  }
+
+  /**
+   * @brief Variadic constructor where the first argument is a `color`.
+   *
+   * Sets the value of the `EventAttribute`s color based on `c` and forwards
+   * the remaining variadic parameter pack to the next constructor.
+   *
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(color const& c, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.color     = c.get_value();
+    attributes_.colorType = c.get_type();
+  }
+
+  /**
+   * @brief Variadic constructor where the first argument is a `payload`.
+   *
+   * Sets the value of the `EventAttribute`s payload based on `p` and forwards
+   * the remaining variadic parameter pack to the next constructor.
+   *
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(payload const& p, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.payload     = p.get_value();
+    attributes_.payloadType = p.get_type();
+  }
+
+  /**
+   * @brief Variadic constructor where the first argument is a `message`.
+   *
+   * Sets the value of the `EventAttribute`s message based on `m` and forwards
+   * the remaining variadic parameter pack to the next constructor.
+   *
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(message const& m, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.message     = m.get_value();
+    attributes_.messageType = m.get_type();
+  }
+
+  ~event_attributes()                       = default;
+  event_attributes(event_attributes const&) = default;
+  event_attributes& operator=(event_attributes const&) = default;
+  event_attributes(event_attributes&&)                 = default;
+  event_attributes& operator=(event_attributes&&) = default;
+
+  /**
+   * @brief Get raw pointer to underlying NVTX attributes object.
+   *
+   */
+  constexpr value_type const* get() const noexcept { return &attributes_; }
+
+ private:
+  value_type attributes_{};  ///< The NVTX attributes structure
+};
+
+/**
+ * @brief A RAII object for creating a NVTX range local to a thread within a
+ * domain.
+ *
+ * When constructed, begins a nested NVTX range on the calling thread in the
+ * specified domain. Upon destruction, ends the NVTX range.
+ *
+ * Behavior is undefined if a `domain_thread_range` object is
+ * created/destroyed on different threads.
+ *
+ * `domain_thread_range` is neither moveable nor copyable.
+ *
+ * `domain_thread_range`s may be nested within other ranges.
+ *
+ * The domain of the range is specified by the template type parameter `D`.
+ * By default, the `domain::global` is used, which scopes the range to the
+ * global NVTX domain. The convenience alias `thread_range` is provided for
+ * ranges scoped to the global domain.
+ *
+ * A custom domain can be defined by creating a type, `D`, with a static
+ * member `D::name` whose value is used to name the domain associated with
+ * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*`
+ *
+ * Example:
+ * ```
+ * // Define a type `my_domain` with a member `name` used to name the domain
+ * // associated with the type `my_domain`.
+ * struct my_domain{
+ *    static constexpr const char * name{"my domain"};
+ * };
+ * ```
+ *
+ * Usage:
+ * ```
+ * nvtx3::domain_thread_range<> r0{"range 0"}; // Range in global domain
+ *
+ * nvtx3::thread_range r1{"range 1"}; // Alias for range in global domain
+ *
+ * nvtx3::domain_thread_range<my_domain> r2{"range 2"}; // Range in custom
+ * domain
+ *
+ * // specify an alias to a range that uses a custom domain
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ *
+ * my_thread_range r3{"range 3"}; // Alias for range in custom domain
+ * ```
+ */
+template <class D = domain::global>
+class domain_thread_range {
+ public:
+  /**
+   * @brief Construct a `domain_thread_range` with the specified
+   * `event_attributes`
+   *
+   * Example:
+   * ```
+   * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}};
+   * nvtx3::domain_thread_range<> range{attr}; // Creates a range with message
+   * contents
+   *                                    // "msg" and green color
+   * ```
+   *
+   * @param[in] attr `event_attributes` that describes the desired attributes
+   * of the range.
+   */
+  explicit domain_thread_range(event_attributes const& attr) noexcept
+  {
+#ifndef NVTX_DISABLE
+    nvtxDomainRangePushEx(domain::get<D>(), attr.get());
+#else
+    (void)attr;
+#endif
+  }
+
+  /**
+   * @brief Constructs a `domain_thread_range` from the constructor arguments
+   * of an `event_attributes`.
+   *
+   * Forwards the arguments `first, args...` to construct an
+   * `event_attributes` object. The `event_attributes` object is then
+   * associated with the `domain_thread_range`.
+   *
+   * For more detail, see `event_attributes` documentation.
+   *
+   * Example:
+   * ```
+   * // Creates a range with message "message" and green color
+   * nvtx3::domain_thread_range<> r{"message", nvtx3::rgb{127,255,0}};
+   * ```
+   *
+   * @note To prevent making needless copies of `event_attributes` objects,
+   * this constructor is disabled when the first argument is an
+   * `event_attributes` object, instead preferring the explicit
+   * `domain_thread_range(event_attributes const&)` constructor.
+   *
+   * @param[in] first First argument to forward to the `event_attributes`
+   * constructor.
+   * @param[in] args Variadic parameter pack of additional arguments to
+   * forward.
+   *
+   */
+  template <typename First,
+            typename... Args,
+            typename = typename std::enable_if<
+              !std::is_same<event_attributes, typename std::decay<First>>::value>>
+  explicit domain_thread_range(First const& first, Args const&... args) noexcept
+    : domain_thread_range{event_attributes{first, args...}}
+  {
+  }
+
+  /**
+   * @brief Default constructor creates a `domain_thread_range` with no
+   * message, color, payload, nor category.
+   *
+   */
+  domain_thread_range() : domain_thread_range{event_attributes{}} {}
+
+  /**
+   * @brief Delete `operator new` to disallow heap allocated objects.
+   *
+   * `domain_thread_range` must follow RAII semantics to guarantee proper push/pop semantics.
+   *
+   */
+  void* operator new(std::size_t) = delete;
+
+  domain_thread_range(domain_thread_range const&) = delete;
+  domain_thread_range& operator=(domain_thread_range const&) = delete;
+  domain_thread_range(domain_thread_range&&)                 = delete;
+  domain_thread_range& operator=(domain_thread_range&&) = delete;
+
+  /**
+   * @brief Destroy the domain_thread_range, ending the NVTX range event.
+   */
+  ~domain_thread_range() noexcept
+  {
+#ifndef NVTX_DISABLE
+    nvtxDomainRangePop(domain::get<D>());
+#endif
+  }
+};
+
+/**
+ * @brief Alias for a `domain_thread_range` in the global NVTX domain.
+ *
+ */
+using thread_range = domain_thread_range<>;
+
+/**
+ * @brief Handle used for correlating explicit range start and end events.
+ *
+ */
+struct range_handle {
+  /// Type used for the handle's value
+  using value_type = nvtxRangeId_t;
+
+  /**
+   * @brief Construct a `range_handle` from the given id.
+   *
+   */
+  constexpr explicit range_handle(value_type id) noexcept : _range_id{id} {}
+
+  /**
+   * @brief Returns the `range_handle`'s value
+   *
+   * @return value_type The handle's value
+   */
+  constexpr value_type get_value() const noexcept { return _range_id; }
+
+ private:
+  value_type _range_id{};  ///< The underlying NVTX range id
+};
+
+/**
+ * @brief Manually begin an NVTX range.
+ *
+ * Explicitly begins an NVTX range and returns a unique handle. To end the
+ * range, pass the handle to `end_range()`.
+ *
+ * `start_range/end_range` are the most explicit and lowest level APIs provided
+ * for creating ranges.  Use of `nvtx3::domain_process_range` should be
+ * preferred unless one is unable to tie the range to the lifetime of an object.
+ *
+ * Example:
+ * ```
+ * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}};
+ * nvtx3::range_handle h = nvxt3::start_range(attr); // Manually begins a range
+ * ...
+ * nvtx3::end_range(h); // Ends the range
+ * ```
+ *
+ * @tparam D Type containing `name` member used to identify the `domain`
+ * to which the range belongs. Else, `domain::global` to indicate that the
+ * global NVTX domain should be used.
+ * @param[in] attr `event_attributes` that describes the desired attributes
+ * of the range.
+ * @return Unique handle to be passed to `end_range` to end the range.
+ */
+template <typename D = domain::global>
+range_handle start_range(event_attributes const& attr) noexcept
+{
+#ifndef NVTX_DISABLE
+  return range_handle{nvtxDomainRangeStartEx(domain::get<D>(), attr.get())};
+#else
+  (void)attr;
+  return range_handle{};
+#endif
+}
+
+/**
+ * @brief Manually begin an NVTX range.
+ *
+ * Explicitly begins an NVTX range and returns a unique handle. To end the
+ * range, pass the handle to `end_range()`.
+ *
+ * Forwards the arguments `first, args...` to construct an  `event_attributes`
+ * object. The `event_attributes` object is then  associated with the range.
+ *
+ * For more detail, see `event_attributes` documentation.
+ *
+ * Example:
+ * ```
+ * nvtx3::range_handle h = nvxt3::start_range("msg", nvtx3::rgb{127,255,0}); //
+ * Begin range
+ * ...
+ * nvtx3::end_range(h); // Ends the range
+ * ```
+ *
+ * `start_range/end_range` are the most explicit and lowest level APIs provided
+ * for creating ranges.  Use of `nvtx3::domain_process_range` should be
+ * preferred unless one is unable to tie the range to the lifetime of an object.
+ *
+ * @param first[in] First argument to pass to an `event_attributes`
+ * @param args[in] Variadiac parameter pack of the rest of the arguments for an
+ * `event_attributes`.
+ * @return Unique handle to be passed to `end_range` to end the range.
+ */
+template <typename First,
+          typename... Args,
+          typename = typename std::enable_if<
+            !std::is_same<event_attributes, typename std::decay<First>>::value>>
+range_handle start_range(First const& first, Args const&... args) noexcept
+{
+#ifndef NVTX_DISABLE
+  return start_range(event_attributes{first, args...});
+#else
+  (void)first;
+  return range_handle{};
+#endif
+}
+
+/**
+ * @brief Manually end the range associated with the handle `r`.
+ *
+ * Explicitly ends the NVTX range indicated by the handle `r` returned from a
+ * prior call to `start_range`. The range may end on a different thread from
+ * where it began.
+ *
+ * This function does not have a Domain tag type template parameter as the
+ * handle `r` already indicates the domain to which the range belongs.
+ *
+ * @param r Handle to a range started by a prior call to `start_range`.
+ */
+inline void end_range(range_handle r)
+{
+#ifndef NVTX_DISABLE
+  nvtxRangeEnd(r.get_value());
+#else
+  (void)r;
+#endif
+}
+
+/**
+ * @brief A RAII object for creating a NVTX range within a domain that can
+ * be created and destroyed on different threads.
+ *
+ * When constructed, begins a NVTX range in the specified domain. Upon
+ * destruction, ends the NVTX range.
+ *
+ * Similar to `nvtx3::domain_thread_range`, the only difference being that
+ * `domain_process_range` can start and end on different threads.
+ *
+ * Use of `nvtx3::domain_thread_range` should be preferred unless one needs
+ * the ability to start and end a range on different threads.
+ *
+ * `domain_process_range` is moveable, but not copyable.
+ *
+ * @tparam D Type containing `name` member used to identify the `domain`
+ * to which the `domain_process_range` belongs. Else, `domain::global` to
+ * indicate that the global NVTX domain should be used.
+ */
+template <typename D = domain::global>
+class domain_process_range {
+ public:
+  /**
+   * @brief Construct a new domain process range object
+   *
+   * @param attr
+   */
+  explicit domain_process_range(event_attributes const& attr) noexcept
+    : handle_{new range_handle{start_range<D>(attr)}}
+  {
+  }
+
+  /**
+   * @brief Construct a new domain process range object
+   *
+   * @param first
+   * @param args
+   */
+  template <typename First,
+            typename... Args,
+            typename = typename std::enable_if<
+              !std::is_same<event_attributes, typename std::decay<First>>::value>>
+  explicit domain_process_range(First const& first, Args const&... args) noexcept
+    : domain_process_range{event_attributes{first, args...}}
+  {
+  }
+
+  /**
+   * @brief Construct a new domain process range object
+   *
+   */
+  constexpr domain_process_range() noexcept : domain_process_range{event_attributes{}} {}
+
+  /**
+   * @brief Destroy the `domain_process_range` ending the range.
+   *
+   */
+  ~domain_process_range()
+  {
+    if (handle_) { end_range(*handle_); }
+  }
+
+  /**
+   * @brief Move constructor allows taking ownership of the NVTX range from
+   * another `domain_process_range`.
+   *
+   * @param other
+   */
+  domain_process_range(domain_process_range&& other) = default;
+
+  /**
+   * @brief Move assignment operator allows taking ownership of an NVTX range
+   * from another `domain_process_range`.
+   *
+   * @param other
+   * @return domain_process_range&
+   */
+  domain_process_range& operator=(domain_process_range&& other) = default;
+
+  /// Copy construction is not allowed to prevent multiple objects from owning
+  /// the same range handle
+  domain_process_range(domain_process_range const&) = delete;
+
+  /// Copy assignment is not allowed to prevent multiple objects from owning the
+  /// same range handle
+  domain_process_range& operator=(domain_process_range const&) = delete;
+
+ private:
+  std::unique_ptr<range_handle> handle_;  ///< Range handle used to correlate
+                                          ///< the start/end of the range
+};
+
+/**
+ * @brief Alias for a `domain_process_range` in the global NVTX domain.
+ *
+ */
+using process_range = domain_process_range<>;
+
+/**
+ * @brief Annotates an instantaneous point in time with the attributes specified
+ * by `attr`.
+ *
+ * Unlike a "range", a mark is an instantaneous event in an application, e.g.,
+ * locking/unlocking a mutex.
+ *
+ * \code{.cpp}
+ * std::mutex global_lock;
+ * void lock_mutex(){
+ *    global_lock.lock();
+ *    nvtx3::mark("lock_mutex");
+ * }
+ * \endcode
+ *
+ * @tparam D Type containing `name` member used to identify the `domain`
+ * to which the `domain_process_range` belongs. Else, `domain::global` to
+ * indicate that the global NVTX domain should be used.
+ * @param[in] attr `event_attributes` that describes the desired attributes
+ * of the mark.
+ */
+template <typename D = domain::global>
+inline void mark(event_attributes const& attr) noexcept
+{
+#ifndef NVTX_DISABLE
+  nvtxDomainMarkEx(domain::get<D>(), attr.get());
+#else
+  (void)(attr);
+#endif
+}
+
+}  // namespace NVTX3_VERSION_NAMESPACE
+
+}  // namespace nvtx3
+
+/**
+ * @brief Convenience macro for generating a range in the specified `domain`
+ * from the lifetime of a function
+ *
+ * This macro is useful for generating an NVTX range in `domain` from
+ * the entry point of a function to its exit. It is intended to be the first
+ * line of the function.
+ *
+ * Constructs a static `registered_string` using the name of the immediately
+ * enclosing function returned by `__func__` and constructs a
+ * `nvtx3::thread_range` using the registered function name as the range's
+ * message.
+ *
+ * Example:
+ * ```
+ * struct my_domain{static constexpr char const* name{"my_domain"};};
+ *
+ * void foo(...){
+ *    NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo()
+ *    // do stuff
+ *    ...
+ * } // Range ends on return from foo()
+ * ```
+ *
+ * @param[in] D Type containing `name` member used to identify the
+ * `domain` to which the `registered_string` belongs. Else,
+ * `domain::global` to  indicate that the global NVTX domain should be used.
+ */
+#ifndef NVTX_DISABLE
+#define NVTX3_V1_FUNC_RANGE_IN(D)                                                  \
+  static ::nvtx3::v1::registered_string<D> const nvtx3_func_name__{__func__};      \
+  static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
+  ::nvtx3::v1::domain_thread_range<D> const nvtx3_range__{nvtx3_func_attr__};
+#else
+#define NVTX3_V1_FUNC_RANGE_IN(D)
+#endif
+
+/**
+ * @brief Convenience macro for generating a range in the global domain from the
+ * lifetime of a function.
+ *
+ * This macro is useful for generating an NVTX range in the global domain from
+ * the entry point of a function to its exit. It is intended to be the first
+ * line of the function.
+ *
+ * Constructs a static `registered_string` using the name of the immediately
+ * enclosing function returned by `__func__` and constructs a
+ * `nvtx3::thread_range` using the registered function name as the range's
+ * message.
+ *
+ * Example:
+ * ```
+ * void foo(...){
+ *    NVTX3_FUNC_RANGE(); // Range begins on entry to foo()
+ *    // do stuff
+ *    ...
+ * } // Range ends on return from foo()
+ * ```
+ */
+#define NVTX3_V1_FUNC_RANGE() NVTX3_V1_FUNC_RANGE_IN(::nvtx3::v1::domain::global)
+
+/* When inlining this version, versioned macros must have unversioned aliases.
+ * For each NVTX3_Vx_ #define, make an NVTX3_ alias of it here.*/
+#if defined(NVTX3_INLINE_THIS_VERSION)
+/* clang format off */
+#define NVTX3_FUNC_RANGE_IN   NVTX3_V1_FUNC_RANGE_IN
+#define NVTX3_FUNC_RANGE      NVTX3_V1_FUNC_RANGE
+/* clang format on */
+#endif
+
+#endif  // NVTX3_CPP_DEFINITIONS_V1_0
+
+/* Add functionality for new minor versions here, by copying the above section enclosed
+ * in #ifndef NVTX3_CPP_DEFINITIONS_Vx_y, and incrementing the minor version.  This code
+ * is an example of how additions for version 1.2 would look, indented for clarity.  Note
+ * that the versioned symbols and macros are always provided, and the unversioned symbols
+ * are only provided if NVTX3_INLINE_THIS_VERSION was defined at the top of this header.
+ *
+ * \code{.cpp}
+ * #ifndef NVTX3_CPP_DEFINITIONS_V1_2
+ * #define NVTX3_CPP_DEFINITIONS_V1_2
+ *     namespace nvtx3 {
+ *         NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE {
+ *             class new_class {};
+ *             inline void new_function() {}
+ *         }
+ *     }
+ *
+ *     // Macros must have the major version in their names:
+ *     #define NVTX3_V1_NEW_MACRO_A() ...
+ *     #define NVTX3_V1_NEW_MACRO_B() ...
+ *
+ *     // If inlining, make aliases for the macros with the version number omitted
+ *     #if defined(NVTX3_INLINE_THIS_VERSION)
+ *         #define NVTX3_NEW_MACRO_A NVTX3_V1_NEW_MACRO_A
+ *         #define NVTX3_NEW_MACRO_B NVTX3_V1_NEW_MACRO_B
+ *     #endif
+ * #endif // NVTX3_CPP_DEFINITIONS_V1_2
+ * \endcode
+ */
+
+/* Undefine all temporarily-defined unversioned macros, which would conflict with
+ * subsequent includes of different versions of this header. */
+#undef NVTX3_CPP_VERSION_MAJOR
+#undef NVTX3_CPP_VERSION_MINOR
+#undef NVTX3_CONCAT
+#undef NVTX3_NAMESPACE_FOR
+#undef NVTX3_VERSION_NAMESPACE
+#undef NVTX3_INLINE_IF_REQUESTED
+#undef NVTX3_RELAXED_CONSTEXPR
+
+#if defined(NVTX3_INLINE_THIS_VERSION)
+#undef NVTX3_INLINE_THIS_VERSION
+#endif
diff --git a/src/include/nvtx3/nvToolsExt.h b/src/include/nvtx3/nvToolsExt.h
new file mode 100644
index 0000000000..ce4b0be651
--- /dev/null
+++ b/src/include/nvtx3/nvToolsExt.h
@@ -0,0 +1,1470 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+/** \file nvToolsExt.h
+ */
+
+/* ========================================================================= */
+/** \mainpage
+ * \tableofcontents
+ * \section INTRODUCTION Introduction
+ *
+ * The NVIDIA Tools Extension library is a set of functions that a
+ * developer can use to provide additional information to tools.
+ * The additional information is used by the tool to improve
+ * analysis and visualization of data.
+ *
+ * The library introduces close to zero overhead if no tool is
+ * attached to the application.  The overhead when a tool is
+ * attached is specific to the tool.
+ *
+ * \section INITIALIZATION_SECTION Initialization
+ *
+ * Typically the tool's library that plugs into NVTX is indirectly 
+ * loaded via enviromental properties that are platform specific. 
+ * For some platform or special cases, the user may be required 
+ * to instead explicity initialize instead though.   This can also
+ * be helpful to control when the API loads a tool's library instead
+ * of what would typically be the first function call to emit info.
+ * For these rare case, see \ref INITIALIZATION for additional information.
+ *
+ * \section MARKERS_AND_RANGES Markers and Ranges
+ *
+ * Markers and ranges are used to describe events at a specific time (markers)
+ * or over a time span (ranges) during the execution of the application
+ * respectively. 
+ *
+ * \subsection MARKERS Markers
+ * 
+ * Markers denote specific moments in time.
+ * 
+ * 
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
+ * how to specify the domain.
+ * 
+ * \subsection THREAD_RANGES Thread Ranges
+ *
+ * Thread ranges denote nested time ranges. Nesting is maintained per thread
+ * per domain and does not require any additional correlation mechanism. The
+ * duration of a thread range is defined by the corresponding pair of
+ * nvtxRangePush* to nvtxRangePop API calls.
+ *
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
+ * how to specify the domain.
+ *
+ * \subsection PROCESS_RANGES Process Ranges
+ *
+ * Process ranges denote a time span that can expose arbitrary concurrency, as 
+ * opposed to thread ranges that only support nesting. In addition the range
+ * start event can happen on a different thread than the end marker. For the 
+ * correlation of a start/end pair an unique correlation ID is used that is
+ * returned from the start API call and needs to be passed into the end API
+ * call.
+ *
+ * \subsection EVENT_ATTRIBUTES Event Attributes
+ *
+ * \ref MARKERS_AND_RANGES can be annotated with various attributes to provide
+ * additional information for an event or to guide the tool's visualization of
+ * the data. Each of the attributes is optional and if left unused the
+ * attributes fall back to a default value. The attributes include:
+ * - color
+ * - category
+ *
+ * To specify any attribute other than the text message, the \ref
+ * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used.
+ *
+ * \section DOMAINS Domains
+ *
+ * Domains enable developers to scope annotations. By default all events and
+ * annotations are in the default domain. Additional domains can be registered.
+ * This allows developers to scope markers, ranges, and resources names to
+ * avoid conflicts.
+ *
+ * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
+ * a named domain.
+ * 
+ * Each domain maintains its own
+ * - categories
+ * - thread range stacks
+ * - registered strings
+ *
+ * The function ::nvtxDomainDestroy marks the end of the domain. Destroying 
+ * a domain unregisters and destroys all objects associated with it such as 
+ * registered strings, resource objects, named categories, and started ranges. 
+ *
+ * \section RESOURCE_NAMING Resource Naming
+ *
+ * This section covers calls that allow to annotate objects with user-provided
+ * names in order to allow for a better analysis of complex trace data. All of
+ * the functions take the handle or the ID of the object to name and the name.
+ * The functions can be called multiple times during the execution of an
+ * application, however, in that case it is implementation dependent which
+ * name will be reported by the tool.
+ * 
+ * \subsection CATEGORY_NAMING Category Naming
+ *
+ * Some function in this library support associating an integer category 
+ * to enable filtering and sorting.  The category naming functions allow 
+ * the application to associate a user friendly name with the integer 
+ * category.  Support for domains have been added in NVTX_VERSION_2 to 
+ * avoid collisions when domains are developed independantly. 
+ *
+ * \subsection RESOURCE_OBJECTS Resource Objects
+ *
+ * Resource objects are a generic mechanism for attaching data to an application 
+ * resource.  The identifier field makes the association to a pointer or handle, 
+ * while the type field helps provide deeper understanding of the identifier as 
+ * well as enabling differentiation in cases where handles generated by different
+ * APIs may collide.  The resource object may also have an associated message to
+ * associate with the application resource, enabling further annotation of this 
+ * object and how it is used.
+ * 
+ * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
+ * functions and allow the application resource identified by those functions to be
+ * associated to a domain.  The other naming functions are still supported for backward
+ * compatibility but will be associated only to the default domain.
+ *
+ * \subsection RESOURCE_NAMING_OS Resource Naming
+ * 
+ * Some operating system resources creation APIs do not support providing a user friendly 
+ * name, such as some OS thread creation APIs.  This API support resource naming though 
+ * both through resource objects and functions following the pattern 
+ * nvtxName[RESOURCE_TYPE][A|W](identifier, name).  Resource objects introduced in NVTX_VERSION 2 
+ * supersede the other functions with a a more general method of assigning names to OS resources,
+ * along with associating them to domains too.  The older nvtxName* functions are only associated 
+ * with the default domain.
+ * \section EXTENSIONS Optional Extensions
+ * Optional extensions will either appear within the existing sections the extend or appear 
+ * in the "Related Pages" when they introduce new concepts.
+ */
+
+ /**
+ * Tools Extension API version
+ */
+#if defined(NVTX_VERSION) && NVTX_VERSION < 3
+#error "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included.  If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX.  Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead."
+#endif
+
+/* Header guard */
+#if !defined(NVTX_VERSION)
+#define NVTX_VERSION 3
+
+#if defined(_MSC_VER)
+#define NVTX_API __stdcall
+#define NVTX_INLINE_STATIC __inline static
+#else /*defined(__GNUC__)*/
+#define NVTX_API
+#define NVTX_INLINE_STATIC inline static
+#endif /* Platform */
+
+#if defined(NVTX_NO_IMPL)
+/* When omitting implementation, avoid declaring functions inline */
+/* without definitions, since this causes compiler warnings. */
+#define NVTX_DECLSPEC
+#elif defined(NVTX_EXPORT_API)
+/* Allow overriding definition of NVTX_DECLSPEC when exporting API. */
+/* Default is empty, meaning non-inline with external linkage. */
+#if !defined(NVTX_DECLSPEC)
+#define NVTX_DECLSPEC
+#endif
+#else
+/* Normal NVTX usage defines the NVTX API inline with static */
+/* (internal) linkage. */
+#define NVTX_DECLSPEC NVTX_INLINE_STATIC
+#endif
+
+#include "nvtxDetail/nvtxLinkOnce.h"
+
+#define NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION) NAME##_v##VERSION
+#define NVTX_VERSIONED_IDENTIFIER_L2(NAME, VERSION) NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION)
+#define NVTX_VERSIONED_IDENTIFIER(NAME) NVTX_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION)
+
+/**
+ * The nvToolsExt library depends on stdint.h.  If the build tool chain in use
+ * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED
+ * and define the following types:
+ * <ul>
+ *   <li>uint8_t
+ *   <li>int8_t
+ *   <li>uint16_t
+ *   <li>int16_t
+ *   <li>uint32_t
+ *   <li>int32_t
+ *   <li>uint64_t
+ *   <li>int64_t
+ *   <li>uintptr_t
+ *   <li>intptr_t
+ * </ul>
+ * #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file.
+ */
+#ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED
+#include <stdint.h>
+#endif
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** 
+* Result Codes
+*/
+
+#define NVTX_SUCCESS 0
+#define NVTX_FAIL 1
+#define NVTX_ERR_INIT_LOAD_PROPERTY 2
+#define NVTX_ERR_INIT_ACCESS_LIBRARY 3
+#define NVTX_ERR_INIT_LOAD_LIBRARY 4
+#define NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT 5
+#define NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT 6
+#define NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE 7
+
+/**
+ * Size of the nvtxEventAttributes_t structure.
+ */
+#define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_t) ) )
+
+#define NVTX_NO_PUSH_POP_TRACKING ((int)-2)
+
+typedef uint64_t nvtxRangeId_t;
+
+/* Forward declaration of opaque domain registration structure */
+struct nvtxDomainRegistration_st;
+typedef struct nvtxDomainRegistration_st nvtxDomainRegistration;
+
+/* \brief Domain Handle Structure.
+* \anchor DOMAIN_HANDLE_STRUCTURE
+*
+* This structure is opaque to the user and is used as a handle to reference
+* a domain.  This type is returned from tools when using the NVTX API to
+* create a domain.
+*
+*/
+typedef nvtxDomainRegistration* nvtxDomainHandle_t;
+
+/* Forward declaration of opaque string registration structure */
+struct nvtxStringRegistration_st;
+typedef struct nvtxStringRegistration_st nvtxStringRegistration;
+
+/* \brief Registered String Handle Structure.
+* \anchor REGISTERED_STRING_HANDLE_STRUCTURE
+*
+* This structure is opaque to the user and is used as a handle to reference
+* a registered string.  This type is returned from tools when using the NVTX
+* API to create a registered string.
+*
+*/
+typedef nvtxStringRegistration* nvtxStringHandle_t;
+
+/* ========================================================================= */
+/** \defgroup GENERAL General
+ * @{
+ */
+
+/** ---------------------------------------------------------------------------
+ * Color Types
+ * ------------------------------------------------------------------------- */
+typedef enum nvtxColorType_t
+{
+    NVTX_COLOR_UNKNOWN  = 0,                 /**< Color attribute is unused. */
+    NVTX_COLOR_ARGB     = 1                  /**< An ARGB color is provided. */
+} nvtxColorType_t;
+
+/** ---------------------------------------------------------------------------
+ * Message Types
+ * ------------------------------------------------------------------------- */
+typedef enum nvtxMessageType_t
+{
+    NVTX_MESSAGE_UNKNOWN          = 0,    /**< Message payload is unused. */
+    NVTX_MESSAGE_TYPE_ASCII       = 1,    /**< A character sequence is used as payload. */
+    NVTX_MESSAGE_TYPE_UNICODE     = 2,     /**< A wide character sequence is used as payload. */
+    /* NVTX_VERSION_2 */
+    NVTX_MESSAGE_TYPE_REGISTERED  = 3,    /**< A unique string handle that was registered
+                                                with \ref nvtxDomainRegisterStringA() or 
+                                                \ref nvtxDomainRegisterStringW(). */
+} nvtxMessageType_t;
+
+typedef union nvtxMessageValue_t
+{
+    const char* ascii;
+    const wchar_t* unicode;
+    /* NVTX_VERSION_2 */
+    nvtxStringHandle_t registered;
+} nvtxMessageValue_t;
+
+
+/** @} */ /*END defgroup*/
+/* ------------------------------------------------------------------------- */
+/** \brief Force initialization (optional)
+*
+* Force NVTX library to initialize.  The first call to any NVTX API function
+* will automatically initialize the entire API.  This can make the first call
+* much slower than subsequent calls.  In applications where the first call to
+* NVTX may be in a performance-critical section, calling nvtxInitialize before
+* any performance-critical sections will ensure NVTX initialization occurs at
+* an acceptable time.  Since nvtxInitialize takes no parameters and has no
+* expected behavior besides initialization, it is convenient to add a call to
+* nvtxInitialize in NVTX-instrumented applications that need to force earlier
+* initialization without changing any other code.  For example, if an app's
+* first NVTX call is nvtxDomainCreate, and it is difficult to move that call
+* earlier because the domain handle must be stored in an object only created
+* at that point, adding a call to nvtxInitialize at the top of main() will
+* ensure the later call to nvtxDomainCreate is as fast as possible.
+*
+* \version \NVTX_VERSION_3
+*
+* \param reserved - must be zero or NULL.
+*
+* @{ */
+NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved);
+/** @} */
+
+
+/** @} */ /*END defgroup*/
+
+/* ========================================================================= */
+/** \defgroup EVENT_ATTRIBUTES Event Attributes
+* @{
+*/
+
+/** ---------------------------------------------------------------------------
+* Payload Types
+* ------------------------------------------------------------------------- */
+typedef enum nvtxPayloadType_t
+{
+    NVTX_PAYLOAD_UNKNOWN = 0,   /**< Color payload is unused. */
+    NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1,   /**< A 64 bit unsigned integer value is used as payload. */
+    NVTX_PAYLOAD_TYPE_INT64 = 2,   /**< A 64 bit signed integer value is used as payload. */
+    NVTX_PAYLOAD_TYPE_DOUBLE = 3,   /**< A 64 bit floating point value is used as payload. */
+    /* NVTX_VERSION_2 */
+    NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 = 4,   /**< A 32 bit floating point value is used as payload. */
+    NVTX_PAYLOAD_TYPE_INT32 = 5,   /**< A 32 bit floating point value is used as payload. */
+    NVTX_PAYLOAD_TYPE_FLOAT = 6    /**< A 32 bit floating point value is used as payload. */
+} nvtxPayloadType_t;
+
+/** \brief Event Attribute Structure.
+ * \anchor EVENT_ATTRIBUTE_STRUCTURE
+ *
+ * This structure is used to describe the attributes of an event. The layout of
+ * the structure is defined by a specific version of the tools extension
+ * library and can change between different versions of the Tools Extension
+ * library.
+ *
+ * \par Initializing the Attributes
+ *
+ * The caller should always perform the following three tasks when using
+ * attributes:
+ * <ul>
+ *    <li>Zero the structure
+ *    <li>Set the version field
+ *    <li>Set the size field
+ * </ul>
+ *
+ * Zeroing the structure sets all the event attributes types and values
+ * to the default value.
+ *
+ * The version and size field are used by the Tools Extension
+ * implementation to handle multiple versions of the attributes structure.
+ *
+ * It is recommended that the caller use one of the following to methods
+ * to initialize the event attributes structure:
+ *
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * \endcode
+ *
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = 1;
+ * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1));
+ * \endcode
+ *
+ * If the caller uses Method 1 it is critical that the entire binary
+ * layout of the structure be configured to 0 so that all fields
+ * are initialized to the default value.
+ *
+ * The caller should either use both NVTX_VERSION and
+ * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
+ * and a versioned type (Method 2).  Using a mix of the two methods
+ * will likely cause either source level incompatibility or binary
+ * incompatibility in the future.
+ *
+ * \par Settings Attribute Types and Values
+ *
+ *
+ * \par Example:
+ * \code
+ * // Initialize
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ *
+ * // Configure the Attributes
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFF880000;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Example";
+ * \endcode
+ *
+ * In the example the caller does not have to set the value of
+ * \ref ::nvtxEventAttributes_v2::category or
+ * \ref ::nvtxEventAttributes_v2::payload as these fields were set to
+ * the default value by {0}.
+ * \sa
+ * ::nvtxDomainMarkEx
+ * ::nvtxDomainRangeStartEx
+ * ::nvtxDomainRangePushEx
+ */
+typedef struct nvtxEventAttributes_v2
+{
+    /**
+    * \brief Version flag of the structure.
+    *
+    * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
+    * supported in this header file. This can optionally be overridden to
+    * another version of the tools extension library.
+    */
+    uint16_t version;
+
+    /**
+    * \brief Size of the structure.
+    *
+    * Needs to be set to the size in bytes of the event attribute
+    * structure used to specify the event.
+    */
+    uint16_t size;
+
+    /**
+     * \brief ID of the category the event is assigned to.
+     *
+     * A category is a user-controlled ID that can be used to group
+     * events.  The tool may use category IDs to improve filtering or
+     * enable grouping of events in the same category. The functions
+     * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
+     * to name a category.
+     *
+     * Default Value is 0
+     */
+    uint32_t category;
+
+    /** \brief Color type specified in this attribute structure.
+     *
+     * Defines the color format of the attribute structure's \ref COLOR_FIELD
+     * "color" field.
+     *
+     * Default Value is NVTX_COLOR_UNKNOWN
+     */
+    int32_t colorType;              /* nvtxColorType_t */
+
+    /** \brief Color assigned to this event. \anchor COLOR_FIELD
+     *
+     * The color that the tool should use to visualize the event.
+     */
+    uint32_t color;
+
+    /**
+     * \brief Payload type specified in this attribute structure.
+     *
+     * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD
+     * "payload" field.
+     *
+     * Default Value is NVTX_PAYLOAD_UNKNOWN
+     */
+    int32_t payloadType;            /* nvtxPayloadType_t */
+
+    int32_t reserved0;
+
+    /**
+     * \brief Payload assigned to this event. \anchor PAYLOAD_FIELD
+     *
+     * A numerical value that can be used to annotate an event. The tool could
+     * use the payload data to reconstruct graphs and diagrams.
+     */
+    union payload_t
+    {
+        uint64_t ullValue;
+        int64_t llValue;
+        double dValue;
+        /* NVTX_VERSION_2 */
+        uint32_t uiValue;
+        int32_t iValue;
+        float fValue;
+    } payload;
+
+    /** \brief Message type specified in this attribute structure.
+     *
+     * Defines the message format of the attribute structure's \ref MESSAGE_FIELD
+     * "message" field.
+     *
+     * Default Value is NVTX_MESSAGE_UNKNOWN
+     */
+    int32_t messageType;            /* nvtxMessageType_t */
+
+    /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
+     *
+     * The text message that is attached to an event.
+     */
+    nvtxMessageValue_t message;
+
+} nvtxEventAttributes_v2;
+
+typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t;
+
+/** @} */ /*END defgroup*/
+/* ========================================================================= */
+/** \defgroup MARKERS_AND_RANGES Markers and Ranges
+ *
+ * See \ref MARKERS_AND_RANGES for more details
+ *
+ * @{
+ */
+
+/** \name Marker */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks an instantaneous event in the application.
+*
+* A marker can contain a text message or specify additional information
+* using the event attributes structure.  These attributes include a text
+* message, color, category, and a payload. Each of the attributes is optional
+* and can only be sent out using the \ref nvtxDomainMarkEx function.
+*
+* nvtxDomainMarkEx(NULL, event) is equivalent to calling
+* nvtxMarkEx(event).
+*
+* \param domain    - The domain of scoping the category.
+* \param eventAttrib - The event attribute structure defining the marker's
+* attribute types and attribute values.
+*
+* \sa
+* ::nvtxMarkEx
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks an instantaneous event in the application.
+ *
+ * A marker can contain a text message or specify additional information
+ * using the event attributes structure.  These attributes include a text
+ * message, color, category, and a payload. Each of the attributes is optional
+ * and can only be sent out using the \ref nvtxMarkEx function.
+ * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker
+ * or if an attribute is unspecified then a default value will be used.
+ *
+ * \param eventAttrib - The event attribute structure defining the marker's
+ * attribute types and attribute values.
+ *
+ * \par Example:
+ * \code
+ * // zero the structure
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * // set the version and the size information
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * // configure the attributes.  0 is the default for all attributes.
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFF880000;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Example nvtxMarkEx";
+ * nvtxMarkEx(&eventAttrib);
+ * \endcode
+ *
+ * \sa
+ * ::nvtxDomainMarkEx
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks an instantaneous event in the application.
+ *
+ * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a
+ * text message.
+ *
+ * \param message     - The message associated to this marker event.
+ *
+ * \par Example:
+ * \code
+ * nvtxMarkA("Example nvtxMarkA");
+ * nvtxMarkW(L"Example nvtxMarkW");
+ * \endcode
+ *
+ * \sa
+ * ::nvtxDomainMarkEx
+ * ::nvtxMarkEx
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message);
+NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message);
+/** @} */
+
+
+/** \name Process Ranges */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Starts a process range in a domain.
+*
+* \param domain    - The domain of scoping the category.
+* \param eventAttrib - The event attribute structure defining the range's
+* attribute types and attribute values.
+*
+* \return The unique ID used to correlate a pair of Start and End events.
+*
+* \remarks Ranges defined by Start/End can overlap.
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
+* nvtxEventAttributes_t eventAttrib = {0};
+* eventAttrib.version = NVTX_VERSION;
+* eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+* eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* eventAttrib.message.ascii = "my range";
+* nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
+* // ...
+* nvtxDomainRangeEnd(rangeId);
+* \endcode
+*
+* \sa
+* ::nvtxDomainRangeEnd
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Starts a process range.
+ *
+ * \param eventAttrib - The event attribute structure defining the range's
+ * attribute types and attribute values.
+ *
+ * \return The unique ID used to correlate a pair of Start and End events.
+ *
+ * \remarks Ranges defined by Start/End can overlap.
+ *
+ * \par Example:
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * eventAttrib.category = 3;
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFF0088FF;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Example Range";
+ * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib);
+ * // ...
+ * nvtxRangeEnd(rangeId);
+ * \endcode
+ *
+ * \sa
+ * ::nvtxRangeEnd
+ * ::nvtxDomainRangeStartEx
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Starts a process range.
+ *
+ * \param message     - The event message associated to this range event.
+ *
+ * \return The unique ID used to correlate a pair of Start and End events.
+ *
+ * \remarks Ranges defined by Start/End can overlap.
+ *
+ * \par Example:
+ * \code
+ * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1");
+ * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2");
+ * nvtxRangeEnd(r1);
+ * nvtxRangeEnd(r2);
+ * \endcode
+ *
+ * \sa
+ * ::nvtxRangeEnd
+ * ::nvtxRangeStartEx
+ * ::nvtxDomainRangeStartEx
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message);
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Ends a process range.
+*
+* \param domain - The domain 
+* \param id - The correlation ID returned from a nvtxRangeStart call.
+*
+* \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. 
+* It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
+* nvtxEventAttributes_t eventAttrib = {0};
+* eventAttrib.version = NVTX_VERSION;
+* eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+* eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* eventAttrib.message.ascii = "my range";
+* nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
+* // ...
+* nvtxDomainRangeEnd(rangeId);
+* \endcode
+*
+* \sa
+* ::nvtxDomainRangeStartEx
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Ends a process range.
+ *
+ * \param id - The correlation ID returned from an nvtxRangeStart call.
+ *
+ * \sa
+ * ::nvtxDomainRangeStartEx
+ * ::nvtxRangeStartEx
+ * ::nvtxRangeStartA
+ * ::nvtxRangeStartW
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id);
+/** @} */
+
+/** \name Thread Ranges */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Starts a nested thread range.
+*
+* \param domain    - The domain of scoping.
+* \param eventAttrib - The event attribute structure defining the range's
+* attribute types and attribute values.
+*
+* \return The 0 based level of range being started. This value is scoped to the domain.
+* If an error occurs, a negative value is returned.
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
+* nvtxEventAttributes_t eventAttrib = {0};
+* eventAttrib.version = NVTX_VERSION;
+* eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+* eventAttrib.colorType = NVTX_COLOR_ARGB;
+* eventAttrib.color = 0xFFFF0000;
+* eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* eventAttrib.message.ascii = "Level 0";
+* nvtxDomainRangePushEx(domain, &eventAttrib);
+*
+* // Re-use eventAttrib
+* eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
+* eventAttrib.message.unicode = L"Level 1";
+* nvtxDomainRangePushEx(domain, &eventAttrib);
+*
+* nvtxDomainRangePop(domain); //level 1
+* nvtxDomainRangePop(domain); //level 0
+* \endcode
+*
+* \sa
+* ::nvtxDomainRangePop
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Starts a nested thread range.
+ *
+ * \param eventAttrib - The event attribute structure defining the range's
+ * attribute types and attribute values.
+ *
+ * \return The 0 based level of range being started. This level is per domain.
+ * If an error occurs a negative value is returned.
+ *
+ * \par Example:
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFFFF0000;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Level 0";
+ * nvtxRangePushEx(&eventAttrib);
+ *
+ * // Re-use eventAttrib
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
+ * eventAttrib.message.unicode = L"Level 1";
+ * nvtxRangePushEx(&eventAttrib);
+ *
+ * nvtxRangePop();
+ * nvtxRangePop();
+ * \endcode
+ *
+ * \sa
+ * ::nvtxDomainRangePushEx
+ * ::nvtxRangePop
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Starts a nested thread range.
+ *
+ * \param message     - The event message associated to this range event.
+ *
+ * \return The 0 based level of range being started.  If an error occurs a
+ * negative value is returned.
+ *
+ * \par Example:
+ * \code
+ * nvtxRangePushA("Level 0");
+ * nvtxRangePushW(L"Level 1");
+ * nvtxRangePop();
+ * nvtxRangePop();
+ * \endcode
+ *
+ * \sa
+ * ::nvtxDomainRangePushEx
+ * ::nvtxRangePop
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message);
+NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message);
+/** @} */
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Ends a nested thread range.
+*
+* \return The level of the range being ended. If an error occurs a negative
+* value is returned on the current thread.
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreate("example library");
+* nvtxDomainRangePushA(domain, "Level 0");
+* nvtxDomainRangePushW(domain, L"Level 1");
+* nvtxDomainRangePop(domain);
+* nvtxDomainRangePop(domain);
+* \endcode
+*
+* \sa
+* ::nvtxRangePushEx
+* ::nvtxRangePushA
+* ::nvtxRangePushW
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Ends a nested thread range.
+ *
+ * \return The level of the range being ended. If an error occurs a negative
+ * value is returned on the current thread.
+ *
+ * \par Example:
+ * \code
+ * nvtxRangePushA("Level 0");
+ * nvtxRangePushW(L"Level 1");
+ * nvtxRangePop();
+ * nvtxRangePop();
+ * \endcode
+ *
+ * \sa
+ * ::nvtxRangePushEx
+ * ::nvtxRangePushA
+ * ::nvtxRangePushW
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
+/** @} */
+
+
+/** @} */ /*END defgroup*/
+/* ========================================================================= */
+/** \defgroup RESOURCE_NAMING Resource Naming
+ *
+ * See \ref RESOURCE_NAMING for more details
+ *
+ * @{
+ */
+
+
+/*  ------------------------------------------------------------------------- */
+/** \name Functions for Generic Resource Naming*/
+/*  ------------------------------------------------------------------------- */
+
+/*  ------------------------------------------------------------------------- */
+/** \cond SHOW_HIDDEN
+* \brief Resource typing helpers.  
+*
+* Classes are used to make it easy to create a series of resource types 
+* per API without collisions 
+*/
+#define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
+#define NVTX_RESOURCE_CLASS_GENERIC 1
+/** \endcond */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Generic resource type for when a resource class is not available.
+*
+* \sa
+* ::nvtxDomainResourceCreate
+*
+* \version \NVTX_VERSION_2
+*/
+typedef enum nvtxResourceGenericType_t
+{
+    NVTX_RESOURCE_TYPE_UNKNOWN = 0,
+    NVTX_RESOURCE_TYPE_GENERIC_POINTER = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 1), /**< Generic pointer assumed to have no collisions with other pointers. */
+    NVTX_RESOURCE_TYPE_GENERIC_HANDLE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 2), /**< Generic handle assumed to have no collisions with other handles. */
+    NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 3), /**< OS native thread identifier. */
+    NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 4) /**< POSIX pthread identifier. */
+} nvtxResourceGenericType_t;
+
+
+
+/** \brief Resource Attribute Structure.
+* \anchor RESOURCE_ATTRIBUTE_STRUCTURE
+*
+* This structure is used to describe the attributes of a resource. The layout of
+* the structure is defined by a specific version of the tools extension
+* library and can change between different versions of the Tools Extension
+* library.
+*
+* \par Initializing the Attributes
+*
+* The caller should always perform the following three tasks when using
+* attributes:
+* <ul>
+*    <li>Zero the structure
+*    <li>Set the version field
+*    <li>Set the size field
+* </ul>
+*
+* Zeroing the structure sets all the resource attributes types and values
+* to the default value.
+*
+* The version and size field are used by the Tools Extension
+* implementation to handle multiple versions of the attributes structure.
+*
+* It is recommended that the caller use one of the following to methods
+* to initialize the event attributes structure:
+*
+* \par Method 1: Initializing nvtxEventAttributes for future compatibility
+* \code
+* nvtxResourceAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
+* \endcode
+*
+* \par Method 2: Initializing nvtxEventAttributes for a specific version
+* \code
+* nvtxResourceAttributes_v0 attribs = {0};
+* attribs.version = 2;
+* attribs.size = (uint16_t)(sizeof(nvtxResourceAttributes_v0));
+* \endcode
+*
+* If the caller uses Method 1 it is critical that the entire binary
+* layout of the structure be configured to 0 so that all fields
+* are initialized to the default value.
+*
+* The caller should either use both NVTX_VERSION and
+* NVTX_RESOURCE_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
+* and a versioned type (Method 2).  Using a mix of the two methods
+* will likely cause either source level incompatibility or binary
+* incompatibility in the future.
+*
+* \par Settings Attribute Types and Values
+*
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
+*
+* // Initialize
+* nvtxResourceAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
+*
+* // Configure the Attributes
+* attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
+* attribs.identifier.pValue = (const void*)pMutex;
+* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* attribs.message.ascii = "Single thread access to database.";
+*
+* nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
+* \endcode
+*
+* \sa
+* ::nvtxDomainResourceCreate
+*/
+typedef struct nvtxResourceAttributes_v0
+{
+    /**
+    * \brief Version flag of the structure.
+    *
+    * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
+    * supported in this header file. This can optionally be overridden to
+    * another version of the tools extension library.
+    */
+    uint16_t version;
+
+    /**
+    * \brief Size of the structure.
+    *
+    * Needs to be set to the size in bytes of this attribute
+    * structure.
+    */
+    uint16_t size;
+
+    /**
+    * \brief Identifier type specifies how to interpret the identifier field
+    *
+    * Defines the identifier format of the attribute structure's \ref RESOURCE_IDENTIFIER_FIELD
+    * "identifier" field.
+    *
+    * Default Value is NVTX_RESOURCE_TYPE_UNKNOWN
+    */
+    int32_t identifierType;            /* values from enums following the pattern nvtxResource[name]Type_t */
+
+    /**
+    * \brief Identifier for the resource. 
+    * \anchor RESOURCE_IDENTIFIER_FIELD
+    *
+    * An identifier may be a pointer or a handle to an OS or middleware API object.
+    * The resource type will assist in avoiding collisions where handles values may collide.
+    */
+    union identifier_t
+    {
+        const void* pValue;
+        uint64_t ullValue;
+    } identifier;
+
+    /** \brief Message type specified in this attribute structure.
+    *
+    * Defines the message format of the attribute structure's \ref RESOURCE_MESSAGE_FIELD
+    * "message" field.
+    *
+    * Default Value is NVTX_MESSAGE_UNKNOWN
+    */
+    int32_t messageType;            /* nvtxMessageType_t */
+
+    /** \brief Message assigned to this attribute structure. \anchor RESOURCE_MESSAGE_FIELD
+    *
+    * The text message that is attached to a resource.
+    */
+    nvtxMessageValue_t message;
+
+} nvtxResourceAttributes_v0;
+
+typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
+
+/* \cond SHOW_HIDDEN 
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
+typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
+/** \endcond */
+
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Create a resource object to track and associate data with OS and middleware objects
+*
+* Allows users to associate an API handle or pointer with a user-provided name.
+* 
+*
+* \param domain - Domain to own the resource object
+* \param attribs - Attributes to be associated with the resource
+*
+* \return A handle that represents the newly created resource object.
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
+* nvtxResourceAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
+* attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
+* attribs.identifier.pValue = (const void*)pMutex;
+* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* attribs.message.ascii = "Single thread access to database.";
+* nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
+* \endcode
+*
+* \sa
+* ::nvtxResourceAttributes_t
+* ::nvtxDomainResourceDestroy
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Destroy a resource object to track and associate data with OS and middleware objects
+*
+* Allows users to associate an API handle or pointer with a user-provided name.
+*
+* \param resource - Handle to the resource in which to operate.
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
+* nvtxResourceAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
+* attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
+* attribs.identifier.pValue = (const void*)pMutex;
+* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* attribs.message.ascii = "Single thread access to database.";
+* nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
+* nvtxDomainResourceDestroy(handle);
+* \endcode
+*
+* \sa
+* ::nvtxDomainResourceCreate
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource);
+/** @} */
+
+
+/** \name Functions for NVTX Category Naming*/
+
+/* ------------------------------------------------------------------------- */
+/**
+* \brief Annotate an NVTX category used within a domain.
+*
+* Categories are used to group sets of events. Each category is identified
+* through a unique ID and that ID is passed into any of the marker/range
+* events to assign that event to a specific category. The nvtxDomainNameCategory
+* function calls allow the user to assign a name to a category ID that is
+* specific to the domain.
+*
+* nvtxDomainNameCategory(NULL, category, name) is equivalent to calling
+* nvtxNameCategory(category, name).
+*
+* \param domain    - The domain of scoping the category.
+* \param category  - The category ID to name.
+* \param name      - The name of the category.
+*
+* \remarks The category names are tracked per domain.
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("example");
+* nvtxDomainNameCategoryA(domain, 1, "Memory Allocation");
+* nvtxDomainNameCategoryW(domain, 2, L"Memory Transfer");
+* \endcode
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
+/** @} */
+
+/** \brief Annotate an NVTX category.
+ *
+ * Categories are used to group sets of events. Each category is identified
+ * through a unique ID and that ID is passed into any of the marker/range
+ * events to assign that event to a specific category. The nvtxNameCategory
+ * function calls allow the user to assign a name to a category ID.
+ *
+ * \param category - The category ID to name.
+ * \param name     - The name of the category.
+ *
+ * \remarks The category names are tracked per process.
+ *
+ * \par Example:
+ * \code
+ * nvtxNameCategory(1, "Memory Allocation");
+ * nvtxNameCategory(2, "Memory Transfer");
+ * nvtxNameCategory(3, "Memory Object Lifetime");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name);
+/** @} */
+
+/** \name Functions for OS Threads Naming*/
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotate an OS thread.
+ *
+ * Allows the user to name an active thread of the current process. If an
+ * invalid thread ID is provided or a thread ID from a different process is
+ * used the behavior of the tool is implementation dependent.
+ *
+ * Tools expect thread ID to be a number that uniquely identifies the thread
+ * at the time of the call. Note that a thread's ID can be reused after
+ * it is destroyed. Tools may choose how to handle aliasing of thread IDs.
+ *
+ * POSIX pthread_t type returned by pthread_self() may not comply with these
+ * expectations. Please use OS-specific thread ID instead of pthread_t.
+ *
+ * The thread name is associated to the default domain.  To support domains 
+ * use resource objects via ::nvtxDomainResourceCreate.
+ *
+ * \param threadId - The ID of the thread to name.
+ * \param name     - The name of the thread.
+ *
+ * \par Examples:
+ * MS Windows:
+ * \code
+ * #include <windows.h>
+ * nvtxNameOsThread(GetCurrentThreadId(), "Current thread");
+ * nvtxNameOsThread(GetThreadId(SomeThreadHandle), "Other thread");
+ * \endcode
+ *
+ * Android:
+ * \code
+ * #include <unistd.h>
+ * nvtxNameOsThreadA(gettid(), "Current thread");
+ * nvtxNameOsThreadA(getpid(), "Main thread");
+ * \endcode
+ *
+ * Linux:
+ * \code
+ * #include <sys/syscall.h>
+ * nvtxNameOsThreadA(syscall(SYS_gettid), "Current thread");
+ * \endcode
+ * \code
+ * #include <unistd.h>
+ * nvtxNameOsThreadA(getpid(), "Main thread");
+ * \endcode
+ *
+ * OS X:
+ * \code
+ * #include <sys/syscall.h>
+ * nvtxNameOsThreadA(syscall(SYS_thread_selfid), "Current thread");
+ * \endcode
+ * \code
+ * #include <pthread.h>
+ * __uint64_t id;
+ * pthread_threadid_np(pthread_self(), &id);
+ * nvtxNameOsThreadA(id, "Current thread");
+ * pthread_threadid_np(somePThreadId, &id);
+ * nvtxNameOsThreadA(id, "Other thread");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name);
+/** @} */
+
+
+/** @} */ /*END defgroup*/
+/* ========================================================================= */
+/** \defgroup STRING_REGISTRATION String Registration
+*
+* Registered strings are intended to increase performance by lowering instrumentation
+* overhead.  String may be registered once and the handle may be passed in place of
+* a string where an the APIs may allow.
+*
+* See \ref STRING_REGISTRATION for more details
+*
+* @{
+*/
+
+/* ------------------------------------------------------------------------- */
+/** \brief Register a string.
+
+* Registers an immutable string with NVTX. Once registered the pointer used
+* to register the domain name can be used in nvtxEventAttributes_t
+* \ref MESSAGE_FIELD. This allows NVTX implementation to skip copying the
+* contents of the message on each event invocation.
+*
+* String registration is an optimization. It is recommended to use string
+* registration if the string will be passed to an event many times.
+*
+* String are not unregistered, except that by unregistering the entire domain
+*
+* \param domain  - Domain handle. If NULL then the global domain is used.
+* \param string    - A unique pointer to a sequence of characters.
+*
+* \return A handle representing the registered string.
+*
+* \par Example:
+* \code
+* nvtxDomainCreateA("com.nvidia.nvtx.example");
+* nvtxStringHandle_t message = nvtxDomainRegisterStringA(domain, "registered string");
+* nvtxEventAttributes_t eventAttrib = {0};
+* eventAttrib.version = NVTX_VERSION;
+* eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+* eventAttrib.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
+* eventAttrib.message.registered = message;
+* \endcode
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string);
+NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string);
+/** @} */
+
+/** @} */ /*END defgroup*/
+/* ========================================================================= */
+/** \defgroup DOMAINS Domains
+*
+* Domains are used to group events to a developer defined scope. Middleware
+* vendors may also scope their own events to avoid collisions with the
+* the application developer's events, so that the application developer may
+* inspect both parts and easily differentiate or filter them.  By default
+* all events are scoped to a global domain where NULL is provided or when
+* using APIs provided b versions of NVTX below v2
+*
+* Domains are intended to be typically long lived objects with the intention
+* of logically separating events of large modules from each other such as
+* middleware libraries from each other and the main application.
+*
+* See \ref DOMAINS for more details
+*
+* @{
+*/
+
+/* ------------------------------------------------------------------------- */
+/** \brief Register a NVTX domain.
+*
+* Domains are used to scope annotations. All NVTX_VERSION_0 and NVTX_VERSION_1
+* annotations are scoped to the global domain. The function nvtxDomainCreate
+* creates a new named domain.
+*
+* Each domain maintains its own nvtxRangePush and nvtxRangePop stack.
+*
+* \param name - A unique string representing the domain.
+*
+* \return A handle representing the domain.
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
+*
+* nvtxMarkA("nvtxMarkA to global domain");
+*
+* nvtxEventAttributes_t eventAttrib1 = {0};
+* eventAttrib1.version = NVTX_VERSION;
+* eventAttrib1.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+* eventAttrib1.message.ascii = "nvtxDomainMarkEx to global domain";
+* nvtxDomainMarkEx(NULL, &eventAttrib1);
+*
+* nvtxEventAttributes_t eventAttrib2 = {0};
+* eventAttrib2.version = NVTX_VERSION;
+* eventAttrib2.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+* eventAttrib2.message.ascii = "nvtxDomainMarkEx to com.nvidia.nvtx.example";
+* nvtxDomainMarkEx(domain, &eventAttrib2);
+* nvtxDomainDestroy(domain);
+* \endcode
+*
+* \sa
+* ::nvtxDomainDestroy
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* name);
+NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Unregister a NVTX domain.
+*
+* Unregisters the domain handle and frees all domain specific resources.
+*
+* \param domain    - the domain handle
+*
+* \par Example:
+* \code
+* nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
+* nvtxDomainDestroy(domain);
+* \endcode
+*
+* \sa
+* ::nvtxDomainCreateA
+* ::nvtxDomainCreateW
+*
+* \version \NVTX_VERSION_2
+* @{ */
+NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
+/** @} */
+
+
+/** @} */ /*END defgroup*/
+/* ========================================================================= */
+/** \cond SHOW_HIDDEN */
+
+#ifdef UNICODE
+    #define nvtxMark            nvtxMarkW
+    #define nvtxRangeStart      nvtxRangeStartW
+    #define nvtxRangePush       nvtxRangePushW
+    #define nvtxNameCategory    nvtxNameCategoryW
+    #define nvtxNameOsThread    nvtxNameOsThreadW
+    /* NVTX_VERSION_2 */
+    #define nvtxDomainCreate         nvtxDomainCreateW
+    #define nvtxDomainRegisterString nvtxDomainRegisterStringW
+    #define nvtxDomainNameCategory   nvtxDomainNameCategoryW
+#else
+    #define nvtxMark            nvtxMarkA
+    #define nvtxRangeStart      nvtxRangeStartA
+    #define nvtxRangePush       nvtxRangePushA
+    #define nvtxNameCategory    nvtxNameCategoryA
+    #define nvtxNameOsThread    nvtxNameOsThreadA
+    /* NVTX_VERSION_2 */
+    #define nvtxDomainCreate         nvtxDomainCreateA
+    #define nvtxDomainRegisterString nvtxDomainRegisterStringA
+    #define nvtxDomainNameCategory   nvtxDomainNameCategoryA
+#endif
+
+/** \endcond */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */
+
+#include "nvtxDetail/nvtxTypes.h"
+
+#ifndef NVTX_NO_IMPL
+#include "nvtxDetail/nvtxImpl.h"
+#endif /*NVTX_NO_IMPL*/
+
+#undef NVTX_IMPL_GUARD
+
+#endif /* !defined(NVTX_VERSION) */
diff --git a/src/include/nvtx3/nvToolsExtCuda.h b/src/include/nvtx3/nvToolsExtCuda.h
new file mode 100644
index 0000000000..b1e654c746
--- /dev/null
+++ b/src/include/nvtx3/nvToolsExtCuda.h
@@ -0,0 +1,141 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#include "cuda.h"
+
+#ifndef NVTOOLSEXT_CUDA_V3
+#define NVTOOLSEXT_CUDA_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for CUDA Resource Naming
+*/
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
+ *
+ * This section covers the API functions that allow to annotate CUDA resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_CUDA  4
+/** \endcond */
+
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for CUDA
+*/
+typedef enum nvtxResourceCUDAType_t
+{
+    NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
+    NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
+    NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
+    NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
+} nvtxResourceCUDAType_t;
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA device.
+ *
+ * Allows the user to associate a CUDA device with a user-provided name.
+ *
+ * \param device - The handle of the CUDA device to name.
+ * \param name   - The name of the CUDA device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA context.
+ *
+ * Allows the user to associate a CUDA context with a user-provided name.
+ *
+ * \param context - The handle of the CUDA context to name.
+ * \param name    - The name of the CUDA context.
+ *
+ * \par Example:
+ * \code
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
+ * if ( CUDA_SUCCESS != status )
+ *     goto Error;
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA stream.
+ *
+ * Allows the user to associate a CUDA stream with a user-provided name.
+ *
+ * \param stream - The handle of the CUDA stream to name.
+ * \param name   - The name of the CUDA stream.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA event.
+ *
+ * Allows the user to associate a CUDA event with a user-provided name.
+ *
+ * \param event - The handle of the CUDA event to name.
+ * \param name  - The name of the CUDA event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameCuDevice   nvtxNameCuDeviceW
+  #define nvtxNameCuContext  nvtxNameCuContextW
+  #define nvtxNameCuStream   nvtxNameCuStreamW
+  #define nvtxNameCuEvent    nvtxNameCuEventW
+#else
+  #define nvtxNameCuDevice   nvtxNameCuDeviceA
+  #define nvtxNameCuContext  nvtxNameCuContextA
+  #define nvtxNameCuStream   nvtxNameCuStreamA
+  #define nvtxNameCuEvent    nvtxNameCuEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplCuda_v3.h"
+#undef NVTX_IMPL_GUARD_CUDA
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_CUDA_V3 */
diff --git a/src/include/nvtx3/nvToolsExtCudaRt.h b/src/include/nvtx3/nvToolsExtCudaRt.h
new file mode 100644
index 0000000000..002f6e9975
--- /dev/null
+++ b/src/include/nvtx3/nvToolsExtCudaRt.h
@@ -0,0 +1,117 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#include "cuda.h"
+#include "driver_types.h"
+
+#ifndef NVTOOLSEXT_CUDART_V3
+#define NVTOOLSEXT_CUDART_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for CUDA Resource Naming
+*/
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
+ *
+ * This section covers the API functions that allow to annotate CUDA resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_CUDART 5
+/** \endcond */
+
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for CUDART
+*/
+typedef enum nvtxResourceCUDARTType_t
+{
+    NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
+    NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
+    NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
+} nvtxResourceCUDARTType_t;
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA device.
+ *
+ * Allows the user to associate a CUDA device with a user-provided name.
+ *
+ * \param device - The id of the CUDA device to name.
+ * \param name   - The name of the CUDA device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA stream.
+ *
+ * Allows the user to associate a CUDA stream with a user-provided name.
+ *
+ * \param stream - The handle of the CUDA stream to name.
+ * \param name   - The name of the CUDA stream.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA event.
+ *
+ * Allows the user to associate a CUDA event with a user-provided name.
+ *
+ * \param event - The handle of the CUDA event to name.
+ * \param name  - The name of the CUDA event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameCudaDevice nvtxNameCudaDeviceW
+  #define nvtxNameCudaStream nvtxNameCudaStreamW
+  #define nvtxNameCudaEvent  nvtxNameCudaEventW
+#else
+  #define nvtxNameCudaDevice nvtxNameCudaDeviceA
+  #define nvtxNameCudaStream nvtxNameCudaStreamA
+  #define nvtxNameCudaEvent  nvtxNameCudaEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplCudaRt_v3.h"
+#undef NVTX_IMPL_GUARD_CUDART
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_CUDART_V3 */
diff --git a/src/include/nvtx3/nvToolsExtOpenCL.h b/src/include/nvtx3/nvToolsExtOpenCL.h
new file mode 100644
index 0000000000..611c0cb07f
--- /dev/null
+++ b/src/include/nvtx3/nvToolsExtOpenCL.h
@@ -0,0 +1,191 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#include <CL/cl.h>
+
+#ifndef NVTOOLSEXT_OPENCL_V3
+#define NVTOOLSEXT_OPENCL_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for OpenCL Resource Naming
+ */
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
+ *
+ * This section covers the API functions that allow to annotate OpenCL resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_OPENCL 6 
+/** \endcond */
+
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for OpenCL
+*/
+typedef enum nvtxResourceOpenCLType_t
+{
+    NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
+    NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
+    NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
+    NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
+    NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
+    NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
+    NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
+} nvtxResourceOpenCLType_t;
+
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL device.
+ *
+ * Allows to associate an OpenCL device with a user-provided name.
+ *
+ * \param device - The handle of the OpenCL device to name.
+ * \param name   - The name of the OpenCL device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL context.
+ *
+ * Allows to associate an OpenCL context with a user-provided name.
+ *
+ * \param context - The handle of the OpenCL context to name.
+ * \param name    - The name of the OpenCL context.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL command queue.
+ *
+ * Allows to associate an OpenCL command queue with a user-provided name.
+ *
+ * \param command_queue - The handle of the OpenCL command queue to name.
+ * \param name          - The name of the OpenCL command queue.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL memory object.
+ *
+ * Allows to associate an OpenCL memory object with a user-provided name.
+ *
+ * \param memobj - The handle of the OpenCL memory object to name.
+ * \param name   - The name of the OpenCL memory object.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL sampler.
+ *
+ * Allows to associate an OpenCL sampler with a user-provided name.
+ *
+ * \param sampler - The handle of the OpenCL sampler to name.
+ * \param name    - The name of the OpenCL sampler.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL program.
+ *
+ * Allows to associate an OpenCL program with a user-provided name.
+ *
+ * \param program - The handle of the OpenCL program to name.
+ * \param name    - The name of the OpenCL program.
+ *
+ * \code
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
+ *     (const char **) &cSourceCL, &program_length, &ciErrNum);
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL event.
+ *
+ * Allows to associate an OpenCL event with a user-provided name.
+ *
+ * \param evnt - The handle of the OpenCL event to name.
+ * \param name - The name of the OpenCL event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameClDevice        nvtxNameClDeviceW
+  #define nvtxNameClContext       nvtxNameClContextW
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueW
+  #define nvtxNameClMemObject     nvtxNameClMemObjectW
+  #define nvtxNameClSampler       nvtxNameClSamplerW
+  #define nvtxNameClProgram       nvtxNameClProgramW
+  #define nvtxNameClEvent         nvtxNameClEventW
+#else
+  #define nvtxNameClDevice        nvtxNameClDeviceA
+  #define nvtxNameClContext       nvtxNameClContextA
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueA
+  #define nvtxNameClMemObject     nvtxNameClMemObjectA
+  #define nvtxNameClSampler       nvtxNameClSamplerA
+  #define nvtxNameClProgram       nvtxNameClProgramA
+  #define nvtxNameClEvent         nvtxNameClEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplOpenCL_v3.h"
+#undef NVTX_IMPL_GUARD_OPENCL
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_OPENCL_V3 */
diff --git a/src/include/nvtx3/nvToolsExtSync.h b/src/include/nvtx3/nvToolsExtSync.h
new file mode 100644
index 0000000000..5d2472962d
--- /dev/null
+++ b/src/include/nvtx3/nvToolsExtSync.h
@@ -0,0 +1,382 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#ifndef NVTOOLSEXT_SYNC_V3
+#define NVTOOLSEXT_SYNC_V3
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* \cond SHOW_HIDDEN 
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
+/** \endcond */
+
+
+/** 
+* \page PAGE_SYNCHRONIZATION Synchronization
+*
+* This section covers a subset of the API that allow users to track additional
+* synchronization details of their application.   Naming OS synchronization primitives 
+* may allow users to better understand the data collected by traced synchronization 
+* APIs.  Additionally, a user defined synchronization object can allow the users to
+* to tell the tools when the user is building their own synchronization system
+* that do not rely on the OS to provide behaviors and instead use techniques like
+* atomic operations and spinlocks.  
+*
+* See module \ref SYNCHRONIZATION for details.
+*
+* \par Example:
+* \code
+* class MyMutex
+* {
+*     volatile long bLocked;
+*     nvtxSyncUser_t hSync;
+* public:
+*     MyMutex(const char* name, nvtxDomainHandle_t d){
+*          bLocked = 0;
+*
+*          nvtxSyncUserAttributes_t attribs = { 0 };
+*          attribs.version = NVTX_VERSION;
+*          attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
+*          attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+*          attribs.message.ascii = name;
+*          hSync = nvtxDomainSyncUserCreate(d, &attribs);
+*     }
+*
+*     ~MyMutex() {
+*          nvtxDomainSyncUserDestroy(hSync);
+*     }
+*
+*     bool Lock() {
+*          nvtxDomainSyncUserAcquireStart(hSync);
+*          bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic 
+
+*          if (acquired) {
+*              nvtxDomainSyncUserAcquireSuccess(hSync);
+*          }
+*          else {
+*              nvtxDomainSyncUserAcquireFailed(hSync);
+*          }
+*          return acquired;
+*     }
+
+*     void Unlock() {
+*          nvtxDomainSyncUserReleasing(hSync);
+*          bLocked = false;
+*     }
+* };
+* \endcode
+* 
+* \version \NVTX_VERSION_2
+*/
+
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN 
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
+#define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
+/** \endcond */
+
+
+/*  ------------------------------------------------------------------------- */
+/** \defgroup SYNCHRONIZATION Synchronization
+* See page \ref PAGE_SYNCHRONIZATION.
+* @{
+*/
+
+/** \brief Resource type values for OSs with POSIX Thread API support
+ */
+typedef enum nvtxResourceSyncPosixThreadType_t
+{
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t  */
+    NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t  */
+} nvtxResourceSyncPosixThreadType_t;
+
+/** \brief Resource type values for Windows OSs
+*/
+typedef enum nvtxResourceSyncWindowsType_t
+{
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
+    NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
+} nvtxResourceSyncWindowsType_t;
+
+/** \brief Resource type values for Linux and Linux derived OSs such as Android
+* \sa
+* ::nvtxResourceSyncPosixThreadType_t
+*/
+typedef enum nvtxResourceSyncLinuxType_t
+{
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
+    NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
+} nvtxResourceSyncLinuxType_t;
+
+/** \brief Resource type values for Android come from Linux.
+* \sa
+* ::nvtxResourceSyncLinuxType_t
+* ::nvtxResourceSyncPosixThreadType_t
+*/
+typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
+
+/** \brief User Defined Synchronization Object Handle .
+* \anchor SYNCUSER_HANDLE_STRUCTURE
+*
+* This structure is opaque to the user and is used as a handle to reference
+* a user defined syncrhonization object.  The tools will return a pointer through the API for the application
+* to hold on it's behalf to reference the string in the future.
+*
+*/
+typedef struct nvtxSyncUser* nvtxSyncUser_t;
+
+/** \brief User Defined Synchronization Object Attributes Structure.
+* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
+*
+* This structure is used to describe the attributes of a user defined synchronization 
+* object.  The layout of the structure is defined by a specific version of the tools 
+* extension library and can change between different versions of the Tools Extension
+* library.
+*
+* \par Initializing the Attributes
+*
+* The caller should always perform the following three tasks when using
+* attributes:
+* <ul>
+*    <li>Zero the structure
+*    <li>Set the version field
+*    <li>Set the size field
+* </ul>
+*
+* Zeroing the structure sets all the event attributes types and values
+* to the default value.
+*
+* The version and size field are used by the Tools Extension
+* implementation to handle multiple versions of the attributes structure.
+*
+* It is recommended that the caller use one of the following to methods
+* to initialize the event attributes structure:
+*
+* \par Method 1: Initializing nvtxEventAttributes for future compatibility
+* \code
+* nvtxSyncUserAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
+* \endcode
+*
+* \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
+* \code
+* nvtxSyncUserAttributes_t attribs = {0};
+* attribs.version = 1;
+* attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
+* \endcode
+*
+* If the caller uses Method 1 it is critical that the entire binary
+* layout of the structure be configured to 0 so that all fields
+* are initialized to the default value.
+*
+* The caller should either use both NVTX_VERSION and
+* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
+* and a versioned type (Method 2).  Using a mix of the two methods
+* will likely cause either source level incompatibility or binary
+* incompatibility in the future.
+*
+* \par Settings Attribute Types and Values
+*
+*
+* \par Example:
+* \code
+* // Initialize
+* nvtxSyncUserAttributes_t attribs = {0};
+* attribs.version = NVTX_VERSION;
+* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
+*
+* // Configure the Attributes
+* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+* attribs.message.ascii = "Example";
+* \endcode
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+*/
+typedef struct nvtxSyncUserAttributes_v0
+{
+    /**
+    * \brief Version flag of the structure.
+    *
+    * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
+    * supported in this header file. This can optionally be overridden to
+    * another version of the tools extension library.
+    */
+    uint16_t version;
+
+    /**
+    * \brief Size of the structure.
+    *
+    * Needs to be set to the size in bytes of the event attribute
+    * structure used to specify the event.
+    */
+    uint16_t size;
+
+    /** \brief Message type specified in this attribute structure.
+    *
+    * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
+    * "message" field.
+    *
+    * Default Value is NVTX_MESSAGE_UNKNOWN
+    */
+    int32_t messageType;            /* nvtxMessageType_t */
+
+    /** \brief Message assigned to this attribute structure.
+    *
+    * The text message that is attached to an event.
+    */
+    nvtxMessageValue_t message;
+
+} nvtxSyncUserAttributes_v0;
+
+typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
+
+/* ------------------------------------------------------------------------- */
+/** \brief Create a user defined synchronization object 
+* This is used to track non-OS synchronization working with spinlocks and atomics
+*
+* \param domain - Domain to own the resource
+* \param attribs - A structure to assign multiple attributes to the object.
+*
+* \return A handle that represents the newly created user defined synchronization object.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Destroy a user defined synchronization object
+* This is used to track non-OS synchronization working with spinlocks and atomics
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools that an attempt to acquire a user defined synchronization object
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools of failure in acquiring a user defined synchronization object
+* This should be called after \ref nvtxDomainSyncUserAcquireStart
+* 
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools of success in acquiring a user defined synchronization object
+* This should be called after \ref nvtxDomainSyncUserAcquireStart.
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
+
+/* ------------------------------------------------------------------------- */
+/** \brief Signal to tools of releasing a reservation on user defined synchronization object
+* This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
+*
+* \param handle - A handle to the object to operate on.
+*
+* \sa
+* ::nvtxDomainSyncUserCreate
+* ::nvtxDomainSyncUserDestroy
+* ::nvtxDomainSyncUserAcquireStart
+* ::nvtxDomainSyncUserAcquireFailed
+* ::nvtxDomainSyncUserAcquireSuccess
+* ::nvtxDomainSyncUserReleasing
+*
+* \version \NVTX_VERSION_2
+*/
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
+
+
+/** @} */ /*END defgroup*/
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
+#include "nvtxDetail/nvtxImplSync_v3.h"
+#undef NVTX_IMPL_GUARD_SYNC
+#endif /*NVTX_NO_IMPL*/
+
+#endif /* NVTOOLSEXT_SYNC_V3 */
diff --git a/src/include/nvtx3/nvtxDetail/nvtxImpl.h b/src/include/nvtx3/nvtxDetail/nvtxImpl.h
new file mode 100644
index 0000000000..be27f4394a
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxImpl.h
@@ -0,0 +1,438 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+/* ---- Include required platform headers ---- */
+
+#if defined(_WIN32) 
+
+#include <Windows.h>
+
+#else
+#include <unistd.h>
+
+#if defined(__ANDROID__)
+#include <android/api-level.h> 
+#endif
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#include <sched.h>
+#endif
+
+#include <limits.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <string.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#endif
+
+/* ---- Define macros used in this file ---- */
+
+#define NVTX_INIT_STATE_FRESH 0
+#define NVTX_INIT_STATE_STARTED 1
+#define NVTX_INIT_STATE_COMPLETE 2
+
+#ifdef NVTX_DEBUG_PRINT
+#ifdef __ANDROID__
+#include <android/log.h>
+#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
+#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
+#else
+#include <stdio.h>
+#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
+#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
+#endif
+#else /* !defined(NVTX_DEBUG_PRINT) */
+#define NVTX_ERR(...)
+#define NVTX_INFO(...)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifdef __GNUC__
+#pragma GCC visibility push(hidden)
+#endif
+
+/* ---- Forward declare all functions referenced in globals ---- */
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
+    NvtxCallbackModule module,
+    NvtxFunctionTable* out_table,
+    unsigned int* out_size);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
+    uint32_t version);
+NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
+    uint32_t exportTableId);
+
+#include "nvtxInitDecls.h"
+
+/* ---- Define all globals ---- */
+
+typedef struct nvtxGlobals_t
+{
+    volatile unsigned int initState;
+    NvtxExportTableCallbacks etblCallbacks;
+    NvtxExportTableVersionInfo etblVersionInfo;
+
+    /* Implementation function pointers */
+    nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
+    nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
+    nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
+    nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
+    nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
+    nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
+    nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
+    nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
+    nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
+    nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
+    nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
+    nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
+    nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
+    nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
+    nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
+
+    nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
+    nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
+    nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
+    nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
+    nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
+    nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
+    nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
+    nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
+
+    nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
+    nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
+    nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
+    nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
+    nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
+    nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
+    nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
+    nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
+    nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
+    nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
+    nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
+    nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
+    nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
+    nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
+
+    nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
+    nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
+    nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
+    nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
+    nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
+    nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
+
+    nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
+    nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
+    nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
+    nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
+    nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
+    nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
+    nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
+    nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
+    nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
+    nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
+    nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
+    nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
+    nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
+    nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
+    nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
+
+    nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
+    nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
+    nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
+    nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
+    nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
+    nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
+
+    /* Tables of function pointers -- Extra null added to the end to ensure
+    *  a crash instead of silent corruption if a tool reads off the end. */
+    NvtxFunctionPointer* functionTable_CORE  [NVTX_CBID_CORE_SIZE   + 1];
+    NvtxFunctionPointer* functionTable_CUDA  [NVTX_CBID_CUDA_SIZE   + 1];
+    NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
+    NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
+    NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE  + 1];
+    NvtxFunctionPointer* functionTable_SYNC  [NVTX_CBID_SYNC_SIZE   + 1];
+} nvtxGlobals_t;
+
+NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
+{
+    NVTX_INIT_STATE_FRESH,
+
+    {
+        sizeof(NvtxExportTableCallbacks),
+        NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
+    },
+    {
+        sizeof(NvtxExportTableVersionInfo),
+        NVTX_VERSION,
+        0,
+        NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
+    },
+
+    /* Implementation function pointers */
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
+
+    /* Tables of function pointers */
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
+        0
+    }
+};
+
+/* ---- Define static inline implementations of core API functions ---- */
+
+#include "nvtxImplCore.h"
+
+/* ---- Define implementations of export table functions ---- */
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
+    NvtxCallbackModule module,
+    NvtxFunctionTable* out_table,
+    unsigned int* out_size)
+{
+    unsigned int bytes = 0;
+    NvtxFunctionTable table = (NvtxFunctionTable)0;
+
+    switch (module)
+    {
+    case NVTX_CB_MODULE_CORE:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
+        break;
+    case NVTX_CB_MODULE_CUDA:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
+        break;
+    case NVTX_CB_MODULE_OPENCL:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
+        break;
+    case NVTX_CB_MODULE_CUDART:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
+        break;
+    case NVTX_CB_MODULE_CORE2:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
+        break;
+    case NVTX_CB_MODULE_SYNC:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
+        break;
+    default: return 0;
+    }
+
+    if (out_size)
+        *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
+
+    if (out_table)
+        *out_table = table;
+
+    return 1;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
+{
+    switch (exportTableId)
+    {
+    case NVTX_ETID_CALLBACKS:       return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
+    case NVTX_ETID_VERSIONINFO:     return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
+    default:                        return 0;
+    }
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
+{
+    /* Reserved for custom implementations to resolve problems with tools */
+    (void)version;
+}
+
+/* ---- Define implementations of init versions of all API functions ---- */
+
+#include "nvtxInitDefs.h"
+
+/* ---- Define implementations of initialization functions ---- */
+
+#include "nvtxInit.h"
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
diff --git a/src/include/nvtx3/nvtxDetail/nvtxImplCore.h b/src/include/nvtx3/nvtxDetail/nvtxImplCore.h
new file mode 100644
index 0000000000..9f014ca7f5
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxImplCore.h
@@ -0,0 +1,307 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr;
+    if(local!=0)
+        (*local)(eventAttrib);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxMarkA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr;
+    if(local!=0)
+        (*local)(message);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxMarkW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr;
+    if(local!=0)
+        (*local)(message);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeStartA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeStartW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr;
+    if(local!=0)
+        (*local)(id);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePushA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePushW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxRangePop(void)
+{
+#ifndef NVTX_DISABLE
+    nvtxRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr;
+    if(local!=0)
+        return (*local)();
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr;
+    if(local!=0)
+        (*local)(category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr;
+    if(local!=0)
+        (*local)(category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameOsThreadA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr;
+    if(local!=0)
+        (*local)(threadId, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameOsThreadW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr;
+    if(local!=0)
+        (*local)(threadId, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, eventAttrib);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxRangeId_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, id);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, eventAttrib);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (int)NVTX_NO_PUSH_POP_TRACKING;
+}
+
+NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainResourceCreate_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, attribs);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxResourceHandle_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainResourceDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr;
+    if(local!=0)
+        (*local)(resource);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr;
+    if(local!=0)
+        (*local)(domain, category, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRegisterStringA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, string);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxStringHandle_t)0;
+}
+
+NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainRegisterStringW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, string);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxStringHandle_t)0;
+}
+
+NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainCreateA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxDomainHandle_t)0;
+}
+
+NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainCreateW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr;
+    if(local!=0)
+        return (*local)(message);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxDomainHandle_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr;
+    if(local!=0)
+        (*local)(domain);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved)
+{
+#ifndef NVTX_DISABLE
+    nvtxInitialize_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr;
+    if(local!=0)
+        (*local)(reserved);
+#endif /*NVTX_DISABLE*/
+}
diff --git a/src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h b/src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
new file mode 100644
index 0000000000..d4c0cdf7eb
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
@@ -0,0 +1,81 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_CUDART
+#error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
+typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
+typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
+typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
diff --git a/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h b/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h
new file mode 100644
index 0000000000..4b5d6c7b42
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h
@@ -0,0 +1,102 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_CUDA
+#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
+typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
+typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
+typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
+typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
+    if(local!=0)
+        (*local)(stream, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
+    if(local!=0)
+        (*local)(event, name);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
diff --git a/src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h b/src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
new file mode 100644
index 0000000000..4a026f024a
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
@@ -0,0 +1,161 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_OPENCL
+#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name);
+typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name);
+typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name);
+typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name);
+typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name);
+typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name);
+typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name);
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClDeviceA_impl_fntype local = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClDeviceW_impl_fntype local = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
+    if(local!=0)
+        (*local)(device, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClContextA_impl_fntype local = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClContextW_impl_fntype local = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
+    if(local!=0)
+        (*local)(context, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClCommandQueueA_impl_fntype local = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
+    if(local!=0)
+        (*local)(command_queue, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClCommandQueueW_impl_fntype local = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
+    if(local!=0)
+        (*local)(command_queue, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClMemObjectA_impl_fntype local = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
+    if(local!=0)
+        (*local)(memobj, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClMemObjectW_impl_fntype local = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
+    if(local!=0)
+        (*local)(memobj, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClSamplerA_impl_fntype local = (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
+    if(local!=0)
+        (*local)(sampler, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClSamplerW_impl_fntype local = (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
+    if(local!=0)
+        (*local)(sampler, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClProgramA_impl_fntype local = (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
+    if(local!=0)
+        (*local)(program, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClProgramW_impl_fntype local = (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
+    if(local!=0)
+        (*local)(program, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClEventA_impl_fntype local = (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
+    if(local!=0)
+        (*local)(evnt, name);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
+{
+#ifndef NVTX_DISABLE
+    nvtxNameClEventW_impl_fntype local = (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
+    if(local!=0)
+        (*local)(evnt, name);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
diff --git a/src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h b/src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h
new file mode 100644
index 0000000000..90616da322
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h
@@ -0,0 +1,83 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD_SYNC
+#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
+
+NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
+    if(local!=0)
+        return (*local)(domain, attribs);
+    else
+#endif  /*NVTX_DISABLE*/
+        return (nvtxSyncUser_t)0;
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
+{
+#ifndef NVTX_DISABLE
+    nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
+    if(local!=0)
+        (*local)(handle);
+#endif /*NVTX_DISABLE*/
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
diff --git a/src/include/nvtx3/nvtxDetail/nvtxInit.h b/src/include/nvtx3/nvtxDetail/nvtxInit.h
new file mode 100644
index 0000000000..44dcc0ff7a
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxInit.h
@@ -0,0 +1,312 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+/* ---- Platform-independent helper definitions and functions ---- */
+
+/* Prefer macros over inline functions to reduce symbol resolution at link time */
+
+#if defined(_WIN32) 
+#define NVTX_PATHCHAR   wchar_t
+#define NVTX_STR(x)     L##x
+#define NVTX_GETENV     _wgetenv
+#define NVTX_BUFSIZE    MAX_PATH
+#define NVTX_DLLHANDLE  HMODULE
+#define NVTX_DLLOPEN(x) LoadLibraryW(x)
+#define NVTX_DLLFUNC    GetProcAddress
+#define NVTX_DLLCLOSE   FreeLibrary
+#define NVTX_YIELD()    SwitchToThread()
+#define NVTX_MEMBAR()   MemoryBarrier()
+#define NVTX_ATOMIC_WRITE_32(address, value)                        InterlockedExchange((volatile LONG*)address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
+#elif defined(__GNUC__)
+#define NVTX_PATHCHAR   char
+#define NVTX_STR(x)     x
+#define NVTX_GETENV     getenv
+#define NVTX_BUFSIZE    PATH_MAX
+#define NVTX_DLLHANDLE  void*
+#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
+#define NVTX_DLLFUNC    dlsym
+#define NVTX_DLLCLOSE   dlclose
+#define NVTX_YIELD()    sched_yield()
+#define NVTX_MEMBAR()   __sync_synchronize()
+/* Ensure full memory barrier for atomics, to match Windows functions */
+#define NVTX_ATOMIC_WRITE_32(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
+#else
+#error The library does not support your configuration!
+#endif
+
+/* Define this to 1 for platforms that where pre-injected libraries can be discovered. */
+#if defined(_WIN32)
+/* TODO */
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#else
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#endif
+
+/* Define this to 1 for platforms that support environment variables */
+/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
+/* Try:  #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
+#define NVTX_SUPPORT_ENV_VARS 1
+
+/* Define this to 1 for platforms that support dynamic/shared libraries */
+#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
+
+/* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked,
+*  and this will override any dynamic injection.  Useful for platforms where dynamic
+*  injection is not available.  Since weak symbols not explicitly marked extern are
+*  guaranteed to be initialized to zero if no definitions are found by the linker, the
+*  dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
+#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
+/* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal
+*  symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which
+*  does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic
+*  injection library. */
+__attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr;
+#else
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
+#endif
+
+/* This function tries to find or load an NVTX injection library and get the
+*  address of its InitializeInjection2 function.  If such a function pointer
+*  is found, it is called, and passed the address of this NVTX instance's
+*  nvtxGetExportTable function, so the injection can attach to this instance.
+*  If the initialization fails for any reason, any dynamic library loaded will
+*  be freed, and all NVTX implementation functions will be set to no-ops.  If
+*  initialization succeeds, NVTX functions not attached to the tool will be set
+*  to no-ops.  This is implemented as one function instead of several small
+*  functions to minimize the number of weak symbols the linker must resolve.
+*  Order of search is:
+*  - Pre-injected library exporting InitializeInjectionNvtx2
+*  - Loadable library exporting InitializeInjectionNvtx2
+*      - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
+*      - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
+*  - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
+*/
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void);
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void)
+{
+    const char* const initFuncName = "InitializeInjectionNvtx2";
+    NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0;
+    NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
+    int entryPointStatus = 0;
+
+#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
+    /* Use POSIX global symbol chain to query for init function from any module */
+    init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName);
+#endif
+
+#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
+    /* Try discovering dynamic injection library to load */
+    if (!init_fnptr)
+    {
+#if NVTX_SUPPORT_ENV_VARS
+        /* If env var NVTX_INJECTION64_PATH is set, it should contain the path
+        *  to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
+        const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
+            ? NVTX_STR("NVTX_INJECTION32_PATH")
+            : NVTX_STR("NVTX_INJECTION64_PATH");
+#endif /* NVTX_SUPPORT_ENV_VARS */
+        NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
+        const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
+
+        /* Refer to this variable explicitly in case all references to it are #if'ed out */
+        (void)injectionLibraryPathBuf;
+
+#if NVTX_SUPPORT_ENV_VARS
+        /* Disable the warning for getenv & _wgetenv -- this usage is safe because
+        *  these functions are not called again before using the returned value. */
+#if defined(_MSC_VER)
+#pragma warning( push )
+#pragma warning( disable : 4996 )
+#endif
+        injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
+#if defined(_MSC_VER)
+#pragma warning( pop )
+#endif
+#endif
+
+#if defined(__ANDROID__)
+        if (!injectionLibraryPath)
+        {
+            const char *bits = (sizeof(void*) == 4) ? "32" : "64";
+            char cmdlineBuf[32];
+            char pkgName[PATH_MAX];
+            int count;
+            int pid;
+            FILE *fp;
+            size_t bytesRead;
+            size_t pos;
+
+            pid = (int)getpid();
+            count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
+            if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
+            {
+                NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            fp = fopen(cmdlineBuf, "r");
+            if (!fp)
+            {
+                NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
+            fclose(fp);
+            if (bytesRead == 0)
+            {
+                NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            pkgName[bytesRead] = 0;
+
+            /* String can contain colon as a process separator. In this case the package name is before the colon. */
+            pos = 0;
+            while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
+            {
+                ++pos;
+            }
+            pkgName[pos] = 0;
+
+            count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
+            if (count <= 0 || count >= NVTX_BUFSIZE)
+            {
+                NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            /* On Android, verify path is accessible due to aggressive file access restrictions. */
+            /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
+            /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
+            if (injectionLibraryPathBuf[0] == '/')
+            {
+#if (__ANDROID_API__ < 21)
+                int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
+#else
+                int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
+#endif
+                if (access_err != 0)
+                {
+                    NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
+                    return NVTX_ERR_INIT_ACCESS_LIBRARY;
+                }
+            }
+            injectionLibraryPath = injectionLibraryPathBuf;
+        }
+#endif
+
+        /* At this point, injectionLibraryPath is specified if a dynamic
+        *  injection library was specified by a tool. */
+        if (injectionLibraryPath)
+        {
+            /* Load the injection library */
+            injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
+            if (!injectionLibraryHandle)
+            {
+                NVTX_ERR("Failed to load injection library\n");
+                return NVTX_ERR_INIT_LOAD_LIBRARY;
+            }
+            else
+            {
+                /* Attempt to get the injection library's entry-point */
+                init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
+                if (!init_fnptr)
+                {
+                    NVTX_DLLCLOSE(injectionLibraryHandle);
+                    NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n");
+                    return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
+                }
+            }
+        }
+    }
+#endif
+
+#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
+    if (!init_fnptr)
+    {
+        /* Check weakly-defined function pointer.  A statically-linked injection can define this as
+        *  a normal symbol and it will take precedence over a dynamic injection. */
+        if (InitializeInjectionNvtx2_fnptr)
+        {
+            init_fnptr = InitializeInjectionNvtx2_fnptr;
+        }
+    }
+#endif
+
+    /* At this point, if init_fnptr is not set, then no tool has specified
+    *  an NVTX injection library -- return non-success result so all NVTX
+    *  API functions will be set to no-ops. */
+    if (!init_fnptr)
+    {
+        return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
+    }
+
+    /* Invoke injection library's initialization function.  If it returns
+    *  0 (failure) and a dynamic injection was loaded, unload it. */
+    entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable));
+    if (entryPointStatus == 0)
+    {
+        NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
+        if (injectionLibraryHandle)
+        {
+            NVTX_DLLCLOSE(injectionLibraryHandle);
+        }
+        return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT;
+    }
+
+    return NVTX_SUCCESS;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void)
+{
+    unsigned int old;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE)
+    {
+        return;
+    }
+
+    NVTX_ATOMIC_CAS_32(
+        old,
+        &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
+        NVTX_INIT_STATE_STARTED,
+        NVTX_INIT_STATE_FRESH);
+    if (old == NVTX_INIT_STATE_FRESH)
+    {
+        int result;
+        int forceAllToNoops;
+
+        /* Load & initialize injection library -- it will assign the function pointers */
+        result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)();
+
+        /* Set all pointers not assigned by the injection to null */
+        forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */
+        NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops);
+
+        /* Signal that initialization has finished, so now the assigned function pointers will be used */
+        NVTX_ATOMIC_WRITE_32(
+            &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
+            NVTX_INIT_STATE_COMPLETE);
+    }
+    else /* Spin-wait until initialization has finished */
+    {
+        NVTX_MEMBAR();
+        while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE)
+        {
+            NVTX_YIELD();
+            NVTX_MEMBAR();
+        }
+    }
+}
diff --git a/src/include/nvtx3/nvtxDetail/nvtxInitDecls.h b/src/include/nvtx3/nvtxDetail/nvtxInitDecls.h
new file mode 100644
index 0000000000..261681b792
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxInitDecls.h
@@ -0,0 +1,81 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved);
+
+NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle);
diff --git a/src/include/nvtx3/nvtxDetail/nvtxInitDefs.h b/src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
new file mode 100644
index 0000000000..ded156c7a1
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
@@ -0,0 +1,573 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxMarkEx(eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxMarkA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxMarkW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangeStartEx(eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangeStartA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangeStartW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxRangeEnd(id);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePushEx(eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePushA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePushW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxRangePop();
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameCategoryA(category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameCategoryW(category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameOsThreadA(threadId, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxNameOsThreadW(threadId, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainMarkEx(domain, eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRangeStartEx(domain, eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainRangeEnd(domain, id);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRangePushEx(domain, eventAttrib);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRangePop(domain);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainResourceCreate(domain, attribs);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainResourceDestroy(resource);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainNameCategoryA(domain, category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainNameCategoryW(domain, category, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRegisterStringA(domain, string);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainRegisterStringW(domain, string);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainCreateA(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    return nvtxDomainCreateW(message);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxDomainDestroy(domain);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved){
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    nvtxInitialize(reserved);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name){
+    nvtxNameCuDeviceA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name){
+    nvtxNameCuDeviceW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name){
+    nvtxNameCuContextA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name){
+    nvtxNameCuContextW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name){
+    nvtxNameCuStreamA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name){
+    nvtxNameCuStreamW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name){
+    nvtxNameCuEventA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name){
+    nvtxNameCuEventW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name){
+    nvtxNameCudaDeviceA_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name){
+    nvtxNameCudaDeviceW_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name){
+    nvtxNameCudaStreamA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name){
+    nvtxNameCudaStreamW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
+    if (local)
+        local(stream, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name){
+    nvtxNameCudaEventA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name){
+    nvtxNameCudaEventW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
+    if (local)
+        local(event, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name){
+    nvtxNameClDeviceA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name){
+    nvtxNameClDeviceW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
+    if (local)
+        local(device, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name){
+    nvtxNameClContextA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name){
+    nvtxNameClContextW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
+    if (local)
+        local(context, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name){
+    nvtxNameClCommandQueueA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
+    if (local)
+        local(command_queue, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name){
+    nvtxNameClCommandQueueW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
+    if (local)
+        local(command_queue, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name){
+    nvtxNameClMemObjectA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
+    if (local)
+        local(memobj, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name){
+    nvtxNameClMemObjectW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
+    if (local)
+        local(memobj, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name){
+    nvtxNameClSamplerA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
+    if (local)
+        local(sampler, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name){
+    nvtxNameClSamplerW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
+    if (local)
+        local(sampler, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name){
+    nvtxNameClProgramA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
+    if (local)
+        local(program, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name){
+    nvtxNameClProgramW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
+    if (local)
+        local(program, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name){
+    nvtxNameClEventA_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
+    if (local)
+        local(evnt, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name){
+    nvtxNameClEventW_fakeimpl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
+    if (local)
+        local(evnt, name);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs){
+    nvtxDomainSyncUserCreate_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
+    if (local) {
+        return local(domain, attribs);
+    }
+    return (nvtxSyncUser_t)0;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserDestroy_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserAcquireStart_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserAcquireFailed_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserAcquireSuccess_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle){
+    nvtxDomainSyncUserReleasing_impl_fntype local;
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
+    local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
+    if (local)
+        local(handle);
+}
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops);
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops)
+{
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL;
+
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL;
+    if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops)
+        NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL;
+}
diff --git a/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h b/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
new file mode 100644
index 0000000000..908ce88b10
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
@@ -0,0 +1,83 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef __NVTX_LINKONCE_H__
+#define __NVTX_LINKONCE_H__
+
+/* This header defines macros to permit making definitions of global variables
+ * and functions in C/C++ header files which may be included multiple times in
+ * a translation unit or linkage unit.  It allows authoring header-only libraries
+ * which can be used by multiple other header-only libraries (either as the same
+ * copy or multiple copies), and does not require any build changes, such as
+ * adding another .c file, linking a static library, or deploying a dynamic
+ * library.  Globals defined with these macros have the property that they have
+ * the same address, pointing to a single instance, for the entire linkage unit.
+ * It is expected but not guaranteed that each linkage unit will have a separate
+ * instance.
+ *
+ * In some situations it is desirable to declare a variable without initializing
+ * it, refer to it in code or other variables' initializers, and then initialize
+ * it later.  Similarly, functions can be prototyped, have their address taken,
+ * and then have their body defined later.  In such cases, use the FWDDECL macros 
+ * when forward-declaring LINKONCE global variables without initializers and
+ * function prototypes, and then use the DEFINE macros when later defining them.
+ * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
+ * following this pattern makes code maximally portable.
+ */
+
+#if defined(__MINGW32__) /* MinGW */
+    #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
+    #endif
+#elif defined(_MSC_VER) /* MSVC */
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   extern "C" __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION __inline
+    #endif
+#elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */
+    #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
+    #endif
+#elif defined(__CYGWIN__) /* Assume GCC or compatible */
+    #define NVTX_LINKONCE_WEAK __attribute__((weak))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   __declspec(selectany)
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
+    #endif
+#else /* All others: Assume GCC, clang, or compatible */
+    #define NVTX_LINKONCE_WEAK   __attribute__((weak))
+    #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden")))
+    #if defined(__cplusplus)
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline
+    #else
+        #define NVTX_LINKONCE_DEFINE_GLOBAL   NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
+        #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
+    #endif
+#endif
+
+#define NVTX_LINKONCE_FWDDECL_GLOBAL   NVTX_LINKONCE_DEFINE_GLOBAL   extern
+#define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION
+
+#endif /* __NVTX_LINKONCE_H__ */
diff --git a/src/include/nvtx3/nvtxDetail/nvtxTypes.h b/src/include/nvtx3/nvtxDetail/nvtxTypes.h
new file mode 100644
index 0000000000..53c6c00727
--- /dev/null
+++ b/src/include/nvtx3/nvtxDetail/nvtxTypes.h
@@ -0,0 +1,304 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+/* This header defines types which are used by the internal implementation
+*  of NVTX and callback subscribers.  API clients do not use these types,
+*  so they are defined here instead of in nvToolsExt.h to clarify they are
+*  not part of the NVTX client API. */
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h.
+#endif
+
+/* ------ Dependency-free types binary-compatible with real types ------- */
+
+/* In order to avoid having the NVTX core API headers depend on non-NVTX
+*  headers like cuda.h, NVTX defines binary-compatible types to use for
+*  safely making the initialization versions of all NVTX functions without
+*  needing to have definitions for the real types. */
+
+typedef int   nvtx_CUdevice;
+typedef void* nvtx_CUcontext;
+typedef void* nvtx_CUstream;
+typedef void* nvtx_CUevent;
+
+typedef void* nvtx_cudaStream_t;
+typedef void* nvtx_cudaEvent_t;
+
+typedef void* nvtx_cl_platform_id;
+typedef void* nvtx_cl_device_id;
+typedef void* nvtx_cl_context;
+typedef void* nvtx_cl_command_queue;
+typedef void* nvtx_cl_mem;
+typedef void* nvtx_cl_program;
+typedef void* nvtx_cl_kernel;
+typedef void* nvtx_cl_event;
+typedef void* nvtx_cl_sampler;
+
+typedef struct nvtxSyncUser* nvtxSyncUser_t;
+struct nvtxSyncUserAttributes_v0;
+typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
+
+/* --------- Types for function pointers (with fake API types) ---------- */
+
+typedef void (NVTX_API * nvtxMarkEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
+typedef void (NVTX_API * nvtxMarkA_impl_fntype)(const char* message);
+typedef void (NVTX_API * nvtxMarkW_impl_fntype)(const wchar_t* message);
+typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
+typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartA_impl_fntype)(const char* message);
+typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartW_impl_fntype)(const wchar_t* message);
+typedef void (NVTX_API * nvtxRangeEnd_impl_fntype)(nvtxRangeId_t id);
+typedef int (NVTX_API * nvtxRangePushEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
+typedef int (NVTX_API * nvtxRangePushA_impl_fntype)(const char* message);
+typedef int (NVTX_API * nvtxRangePushW_impl_fntype)(const wchar_t* message);
+typedef int (NVTX_API * nvtxRangePop_impl_fntype)(void);
+typedef void (NVTX_API * nvtxNameCategoryA_impl_fntype)(uint32_t category, const char* name);
+typedef void (NVTX_API * nvtxNameCategoryW_impl_fntype)(uint32_t category, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameOsThreadA_impl_fntype)(uint32_t threadId, const char* name);
+typedef void (NVTX_API * nvtxNameOsThreadW_impl_fntype)(uint32_t threadId, const wchar_t* name);
+
+/* Real impl types are defined in nvtxImplCuda_v3.h, where CUDA headers are included */
+typedef void (NVTX_API * nvtxNameCuDeviceA_fakeimpl_fntype)(nvtx_CUdevice device, const char* name);
+typedef void (NVTX_API * nvtxNameCuDeviceW_fakeimpl_fntype)(nvtx_CUdevice device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuContextA_fakeimpl_fntype)(nvtx_CUcontext context, const char* name);
+typedef void (NVTX_API * nvtxNameCuContextW_fakeimpl_fntype)(nvtx_CUcontext context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuStreamA_fakeimpl_fntype)(nvtx_CUstream stream, const char* name);
+typedef void (NVTX_API * nvtxNameCuStreamW_fakeimpl_fntype)(nvtx_CUstream stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuEventA_fakeimpl_fntype)(nvtx_CUevent event, const char* name);
+typedef void (NVTX_API * nvtxNameCuEventW_fakeimpl_fntype)(nvtx_CUevent event, const wchar_t* name);
+
+/* Real impl types are defined in nvtxImplOpenCL_v3.h, where OPENCL headers are included */
+typedef void (NVTX_API * nvtxNameClDeviceA_fakeimpl_fntype)(nvtx_cl_device_id device, const char* name);
+typedef void (NVTX_API * nvtxNameClDeviceW_fakeimpl_fntype)(nvtx_cl_device_id device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClContextA_fakeimpl_fntype)(nvtx_cl_context context, const char* name);
+typedef void (NVTX_API * nvtxNameClContextW_fakeimpl_fntype)(nvtx_cl_context context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueA_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const char* name);
+typedef void (NVTX_API * nvtxNameClCommandQueueW_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClMemObjectA_fakeimpl_fntype)(nvtx_cl_mem memobj, const char* name);
+typedef void (NVTX_API * nvtxNameClMemObjectW_fakeimpl_fntype)(nvtx_cl_mem memobj, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClSamplerA_fakeimpl_fntype)(nvtx_cl_sampler sampler, const char* name);
+typedef void (NVTX_API * nvtxNameClSamplerW_fakeimpl_fntype)(nvtx_cl_sampler sampler, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClProgramA_fakeimpl_fntype)(nvtx_cl_program program, const char* name);
+typedef void (NVTX_API * nvtxNameClProgramW_fakeimpl_fntype)(nvtx_cl_program program, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameClEventA_fakeimpl_fntype)(nvtx_cl_event evnt, const char* name);
+typedef void (NVTX_API * nvtxNameClEventW_fakeimpl_fntype)(nvtx_cl_event evnt, const wchar_t* name);
+
+/* Real impl types are defined in nvtxImplCudaRt_v3.h, where CUDART headers are included */
+typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
+typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaStreamA_fakeimpl_fntype)(nvtx_cudaStream_t stream, const char* name);
+typedef void (NVTX_API * nvtxNameCudaStreamW_fakeimpl_fntype)(nvtx_cudaStream_t stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaEventA_fakeimpl_fntype)(nvtx_cudaEvent_t event, const char* name);
+typedef void (NVTX_API * nvtxNameCudaEventW_fakeimpl_fntype)(nvtx_cudaEvent_t event, const wchar_t* name);
+
+typedef void (NVTX_API * nvtxDomainMarkEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+typedef nvtxRangeId_t (NVTX_API * nvtxDomainRangeStartEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+typedef void (NVTX_API * nvtxDomainRangeEnd_impl_fntype)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
+typedef int (NVTX_API * nvtxDomainRangePushEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
+typedef int (NVTX_API * nvtxDomainRangePop_impl_fntype)(nvtxDomainHandle_t domain);
+typedef nvtxResourceHandle_t (NVTX_API * nvtxDomainResourceCreate_impl_fntype)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
+typedef void (NVTX_API * nvtxDomainResourceDestroy_impl_fntype)(nvtxResourceHandle_t resource);
+typedef void (NVTX_API * nvtxDomainNameCategoryA_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
+typedef void (NVTX_API * nvtxDomainNameCategoryW_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
+typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringA_impl_fntype)(nvtxDomainHandle_t domain, const char* string);
+typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringW_impl_fntype)(nvtxDomainHandle_t domain, const wchar_t* string);
+typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateA_impl_fntype)(const char* message);
+typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateW_impl_fntype)(const wchar_t* message);
+typedef void (NVTX_API * nvtxDomainDestroy_impl_fntype)(nvtxDomainHandle_t domain);
+typedef void (NVTX_API * nvtxInitialize_impl_fntype)(const void* reserved);
+
+typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
+typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
+typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
+
+/* ---------------- Types for callback subscription --------------------- */
+
+typedef const void *(NVTX_API * NvtxGetExportTableFunc_t)(uint32_t exportTableId);
+typedef int (NVTX_API * NvtxInitializeInjectionNvtxFunc_t)(NvtxGetExportTableFunc_t exportTable);
+
+typedef enum NvtxCallbackModule
+{
+    NVTX_CB_MODULE_INVALID                 = 0,
+    NVTX_CB_MODULE_CORE                    = 1,
+    NVTX_CB_MODULE_CUDA                    = 2,
+    NVTX_CB_MODULE_OPENCL                  = 3,
+    NVTX_CB_MODULE_CUDART                  = 4,
+    NVTX_CB_MODULE_CORE2                   = 5,
+    NVTX_CB_MODULE_SYNC                    = 6,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CB_MODULE_SIZE,
+    NVTX_CB_MODULE_FORCE_INT               = 0x7fffffff
+} NvtxCallbackModule;
+
+typedef enum NvtxCallbackIdCore
+{
+    NVTX_CBID_CORE_INVALID                 =  0,
+    NVTX_CBID_CORE_MarkEx                  =  1,
+    NVTX_CBID_CORE_MarkA                   =  2,
+    NVTX_CBID_CORE_MarkW                   =  3,
+    NVTX_CBID_CORE_RangeStartEx            =  4,
+    NVTX_CBID_CORE_RangeStartA             =  5,
+    NVTX_CBID_CORE_RangeStartW             =  6,
+    NVTX_CBID_CORE_RangeEnd                =  7,
+    NVTX_CBID_CORE_RangePushEx             =  8,
+    NVTX_CBID_CORE_RangePushA              =  9,
+    NVTX_CBID_CORE_RangePushW              = 10,
+    NVTX_CBID_CORE_RangePop                = 11,
+    NVTX_CBID_CORE_NameCategoryA           = 12,
+    NVTX_CBID_CORE_NameCategoryW           = 13,
+    NVTX_CBID_CORE_NameOsThreadA           = 14,
+    NVTX_CBID_CORE_NameOsThreadW           = 15,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CORE_SIZE,
+    NVTX_CBID_CORE_FORCE_INT = 0x7fffffff
+} NvtxCallbackIdCore;
+
+typedef enum NvtxCallbackIdCore2
+{
+    NVTX_CBID_CORE2_INVALID                 = 0,
+    NVTX_CBID_CORE2_DomainMarkEx            = 1,
+    NVTX_CBID_CORE2_DomainRangeStartEx      = 2,
+    NVTX_CBID_CORE2_DomainRangeEnd          = 3,
+    NVTX_CBID_CORE2_DomainRangePushEx       = 4,
+    NVTX_CBID_CORE2_DomainRangePop          = 5,
+    NVTX_CBID_CORE2_DomainResourceCreate    = 6,
+    NVTX_CBID_CORE2_DomainResourceDestroy   = 7,
+    NVTX_CBID_CORE2_DomainNameCategoryA     = 8,
+    NVTX_CBID_CORE2_DomainNameCategoryW     = 9,
+    NVTX_CBID_CORE2_DomainRegisterStringA   = 10,
+    NVTX_CBID_CORE2_DomainRegisterStringW   = 11,
+    NVTX_CBID_CORE2_DomainCreateA           = 12,
+    NVTX_CBID_CORE2_DomainCreateW           = 13,
+    NVTX_CBID_CORE2_DomainDestroy           = 14,
+    NVTX_CBID_CORE2_Initialize              = 15,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CORE2_SIZE,
+    NVTX_CBID_CORE2_FORCE_INT               = 0x7fffffff
+} NvtxCallbackIdCore2;
+
+typedef enum NvtxCallbackIdCuda
+{
+    NVTX_CBID_CUDA_INVALID                 =  0,
+    NVTX_CBID_CUDA_NameCuDeviceA           =  1,
+    NVTX_CBID_CUDA_NameCuDeviceW           =  2,
+    NVTX_CBID_CUDA_NameCuContextA          =  3,
+    NVTX_CBID_CUDA_NameCuContextW          =  4,
+    NVTX_CBID_CUDA_NameCuStreamA           =  5,
+    NVTX_CBID_CUDA_NameCuStreamW           =  6,
+    NVTX_CBID_CUDA_NameCuEventA            =  7,
+    NVTX_CBID_CUDA_NameCuEventW            =  8,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CUDA_SIZE,
+    NVTX_CBID_CUDA_FORCE_INT               = 0x7fffffff
+} NvtxCallbackIdCuda;
+
+typedef enum NvtxCallbackIdCudaRt
+{
+    NVTX_CBID_CUDART_INVALID               =  0,
+    NVTX_CBID_CUDART_NameCudaDeviceA       =  1,
+    NVTX_CBID_CUDART_NameCudaDeviceW       =  2,
+    NVTX_CBID_CUDART_NameCudaStreamA       =  3,
+    NVTX_CBID_CUDART_NameCudaStreamW       =  4,
+    NVTX_CBID_CUDART_NameCudaEventA        =  5,
+    NVTX_CBID_CUDART_NameCudaEventW        =  6,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_CUDART_SIZE,
+    NVTX_CBID_CUDART_FORCE_INT             = 0x7fffffff
+} NvtxCallbackIdCudaRt;
+
+typedef enum NvtxCallbackIdOpenCL
+{
+    NVTX_CBID_OPENCL_INVALID               =  0,
+    NVTX_CBID_OPENCL_NameClDeviceA         =  1,
+    NVTX_CBID_OPENCL_NameClDeviceW         =  2,
+    NVTX_CBID_OPENCL_NameClContextA        =  3,
+    NVTX_CBID_OPENCL_NameClContextW        =  4,
+    NVTX_CBID_OPENCL_NameClCommandQueueA   =  5,
+    NVTX_CBID_OPENCL_NameClCommandQueueW   =  6,
+    NVTX_CBID_OPENCL_NameClMemObjectA      =  7,
+    NVTX_CBID_OPENCL_NameClMemObjectW      =  8,
+    NVTX_CBID_OPENCL_NameClSamplerA        =  9,
+    NVTX_CBID_OPENCL_NameClSamplerW        = 10,
+    NVTX_CBID_OPENCL_NameClProgramA        = 11,
+    NVTX_CBID_OPENCL_NameClProgramW        = 12,
+    NVTX_CBID_OPENCL_NameClEventA          = 13,
+    NVTX_CBID_OPENCL_NameClEventW          = 14,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_OPENCL_SIZE,
+    NVTX_CBID_OPENCL_FORCE_INT             = 0x7fffffff
+} NvtxCallbackIdOpenCL;
+
+typedef enum NvtxCallbackIdSync
+{
+    NVTX_CBID_SYNC_INVALID                      = 0,
+    NVTX_CBID_SYNC_DomainSyncUserCreate         = 1,
+    NVTX_CBID_SYNC_DomainSyncUserDestroy        = 2,
+    NVTX_CBID_SYNC_DomainSyncUserAcquireStart   = 3,
+    NVTX_CBID_SYNC_DomainSyncUserAcquireFailed  = 4,
+    NVTX_CBID_SYNC_DomainSyncUserAcquireSuccess = 5,
+    NVTX_CBID_SYNC_DomainSyncUserReleasing      = 6,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_CBID_SYNC_SIZE,
+    NVTX_CBID_SYNC_FORCE_INT                    = 0x7fffffff
+} NvtxCallbackIdSync;
+
+/* IDs for NVTX Export Tables */
+typedef enum NvtxExportTableID
+{
+    NVTX_ETID_INVALID                      = 0,
+    NVTX_ETID_CALLBACKS                    = 1,
+    NVTX_ETID_RESERVED0                    = 2,
+    NVTX_ETID_VERSIONINFO                  = 3,
+    /* --- New constants must only be added directly above this line --- */
+    NVTX_ETID_SIZE,
+    NVTX_ETID_FORCE_INT                    = 0x7fffffff
+} NvtxExportTableID;
+
+typedef void (* NvtxFunctionPointer)(void); /* generic uncallable function pointer, must be casted to appropriate function type */
+typedef NvtxFunctionPointer** NvtxFunctionTable; /* double pointer because array(1) of pointers(2) to function pointers */
+
+typedef struct NvtxExportTableCallbacks
+{
+    size_t struct_size;
+
+    /* returns an array of pointer to function pointers*/
+    int (NVTX_API *GetModuleFunctionTable)(
+        NvtxCallbackModule module,
+        NvtxFunctionTable* out_table,
+        unsigned int* out_size);
+} NvtxExportTableCallbacks;
+
+typedef struct NvtxExportTableVersionInfo
+{
+    /* sizeof(NvtxExportTableVersionInfo) */
+    size_t struct_size;
+
+    /* The API version comes from the NVTX library linked to the app.  The
+    * injection library is can use this info to make some assumptions */
+    uint32_t version;
+
+    /* Reserved for alignment, do not use */
+    uint32_t reserved0;
+
+    /* This must be set by tools when attaching to provide applications
+    *  the ability to, in emergency situations, detect problematic tools
+    *  versions and modify the NVTX source to prevent attaching anything
+    *  that causes trouble in the app.  Currently, this value is ignored. */
+    void (NVTX_API *SetInjectionNvtxVersion)(
+        uint32_t version);
+} NvtxExportTableVersionInfo;
+
+
+
+
+
+
+
diff --git a/src/include/p2p.h b/src/include/p2p.h
index 9d3730e0cd..756c8d289b 100644
--- a/src/include/p2p.h
+++ b/src/include/p2p.h
@@ -10,23 +10,34 @@
 #define NCCL_P2P_H_
 
 struct ncclP2Pinfo {
- const void* sendbuff;
-  void* recvbuff;
-  ssize_t sendbytes;
-  ssize_t recvbytes;
-};
-
-struct ncclP2PConnect {
-  int nrecv[MAXCHANNELS];
-  int nsend[MAXCHANNELS];
-  int* recv;
-  int* send;
+  void* buff;
+  ssize_t nbytes;
+  struct ncclP2Pinfo* next;
 };
 
 struct ncclP2Plist {
-  struct ncclP2Pinfo *peerlist;
-  int count;
-  struct ncclP2PConnect connect;
+  struct ncclP2Pinfo *head;
+  struct ncclP2Pinfo *tail;
 };
 
+static ncclResult_t enqueueP2pInfo(ncclP2Plist* p2p, void* buff, ssize_t nBytes) {
+  if (p2p == NULL) return ncclInternalError;
+  struct ncclP2Pinfo* next;
+  NCCLCHECK(ncclCalloc(&next, 1));
+  next->buff = buff;
+  next->nbytes = nBytes;
+  if (p2p->tail != NULL) p2p->tail->next = next;
+  p2p->tail = next;
+  if (p2p->head == NULL) p2p->head = next;
+  return ncclSuccess;
+}
+
+static ncclResult_t dequeueP2pInfo(ncclP2Plist* p2p) {
+  if (p2p == NULL) return ncclInternalError;
+  struct ncclP2Pinfo* temp = p2p->head;
+  p2p->head = p2p->head->next;
+  if (p2p->tail == temp) p2p->tail = NULL;
+  free(temp);
+  return ncclSuccess;
+}
 #endif
diff --git a/src/include/param.h b/src/include/param.h
index 54317571e7..e4c11df2da 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -31,10 +31,11 @@ static void setEnvFile(const char* fileName) {
     int s=0; // Env Var Size
     while (line[s] != '\0' && line[s] != '=') s++;
     if (line[s] == '\0') continue;
-    strncpy(envVar, line, std::min(1024,s));
+    strncpy(envVar, line, std::min(1023,s));
     envVar[s] = '\0';
     s++;
-    strncpy(envValue, line+s, 1024);
+    strncpy(envValue, line+s, 1023);
+    envValue[1023]='\0';
     setenv(envVar, envValue, 0);
   }
   if (line) free(line);
diff --git a/src/include/proxy.h b/src/include/proxy.h
index 04daa844b1..9796baf39a 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -18,18 +18,23 @@ struct ncclProxyArgs {
   proxyProgressFunc_t progress;
   struct ncclChannel* channel;
   struct ncclConnector* connector;
+  size_t sendbytes;
+  size_t recvbytes;
   int sliceSteps;
   int chunkSteps;
   int nsteps;
   uint64_t opCount;
   int protocol;
+  int segment; // Only for profiling
   ncclDataType_t dtype;
   ncclRedOp_t redOp;
   int state;   // add component before this line -- it is left out during initialization
 
   // Internal state
-  uint64_t head;
-  uint64_t tail;
+  uint64_t posted;
+  uint64_t received; // Only used by recv proxy to wait for flush.
+  uint64_t transmitted;
+  uint64_t done;
   uint64_t end;
   void* requests[NCCL_STEPS];
   int idle;
@@ -38,14 +43,30 @@ struct ncclProxyArgs {
   pthread_mutex_t mutex;
   struct ncclProxyArgs* next;
   struct ncclProxyArgs* nextPeer;
+  struct ncclProxyArgs* nextGroup;
+  struct ncclProxyArgs** proxyAppendPtr;
+};
+
+struct ncclProxySharedBuffers {
+  int nslots;
+  int slotSize;
+  char* cudaBuff[2*MAXCHANNELS];
+  int* cudaUsed[2*MAXCHANNELS];
+  char* hostBuff[2*MAXCHANNELS];
+  int* hostUsed[2*MAXCHANNELS];
+  struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
 };
 
 struct ncclProxyPool;
 struct ncclProxyState {
   pthread_cond_t cond;
-  pthread_mutex_t mutex;
+  pthread_mutex_t opsMutex;
+  pthread_mutex_t poolMutex;
   bool stop;
+  struct ncclProxySharedBuffers* sharedBuffs;
   struct ncclProxyArgs* ops;
+  struct ncclProxyArgs* nextOps;
+  struct ncclProxyArgs* nextOpsEnd;
   struct ncclProxyArgs* pool;
   struct ncclProxyPool* pools;
 };
@@ -59,11 +80,16 @@ enum proxyMode {
 };
 
 ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
-ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel);
+ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
 
+ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
+ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
+ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
+ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
+
 #include <unistd.h>
 
 // Spin wait until func evaluates to true
diff --git a/src/include/socket.h b/src/include/socket.h
index 46b204db08..e903b04280 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,6 +21,7 @@
 #define SLEEP_INT            1000 // connection retry sleep interval in usec
 #define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
 #define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
+#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
 
 /* Common socket address storage structure for IPv4/IPv6 */
 union socketAddress {
@@ -64,7 +65,7 @@ static inline int envSocketFamily(void) {
 
 static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
 #ifdef ENABLE_TRACE
-  char line[1024];
+  char line[SOCKET_NAME_MAXLEN+1];
 #endif
   struct netIf userIfs[MAX_IFS];
   bool searchNot = prefixList && prefixList[0] == '^';
@@ -167,9 +168,9 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
 
 static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
 #ifdef ENABLE_TRACE
-  char line[1024];
+  char line[SOCKET_NAME_MAXLEN+1];
 #endif
-  char line_a[1024];
+  char line_a[SOCKET_NAME_MAXLEN+1];
   int found = 0;
   struct ifaddrs *interfaces, *interface;
   getifaddrs(&interfaces);
@@ -350,7 +351,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
   SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
 
 #ifdef ENABLE_TRACE
-  char line[1024];
+  char line[SOCKET_NAME_MAXLEN+1];
   TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
 #endif
 
@@ -365,6 +366,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
 static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
   /* IPv4/IPv6 support */
   int family = remoteAddr->sa.sa_family;
+  if (family != AF_INET && family != AF_INET6) {
+    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
+    return ncclInternalError;
+  }
   int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
 
   /* Connect to a hostname / port */
@@ -381,10 +386,8 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
     SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
     SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
 
-  char line[1024];
-#ifdef ENABLE_TRACE
+  char line[SOCKET_NAME_MAXLEN+1];
   TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
-#endif
 
   int ret;
   int timedout_retries = 0;
@@ -445,7 +448,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
   return ncclSuccess;
 }
 
-static ncclResult_t socketReceive(int fd, void* ptr, int size) {
+static ncclResult_t socketRecv(int fd, void* ptr, int size) {
   int offset = 0;
   NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, ptr, size, &offset));
   return ncclSuccess;
diff --git a/src/include/transport.h b/src/include/transport.h
index 5a85688515..2ecc72748d 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -41,8 +41,8 @@ struct ncclConnect {
 };
 
 struct ncclTransportComm {
-  ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
-  ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
+  ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
+  ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
   ncclResult_t (*free)(void*);
   ncclResult_t (*proxy)(struct ncclProxyArgs*);
 };
@@ -54,6 +54,7 @@ struct ncclTransport {
   struct ncclTransportComm recv;
 };
 
-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
 
 #endif
diff --git a/src/include/trees.h b/src/include/trees.h
index 7eadd8556e..ded84a6677 100644
--- a/src/include/trees.h
+++ b/src/include/trees.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,7 +7,7 @@
 #ifndef NCCL_TREES_H_
 #define NCCL_TREES_H_
 
-ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
-ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
+ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
 
 #endif
diff --git a/src/init.cc b/src/init.cc
index 585db4b506..81831cf5dc 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -158,9 +158,11 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
 static ncclResult_t commFree(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
-  free(comm->p2plist.peerlist);
-  free(comm->p2plist.connect.recv);
-  free(comm->p2plist.connect.send);
+  free(comm->connectSend);
+  free(comm->connectRecv);
+  free(comm->p2pSends);
+  free(comm->p2pRecvs);
+  free(comm->asyncOps);
 
   free(comm->peerInfo);
   ncclTopoFree(comm->topo);
@@ -191,7 +193,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
     free(comm->intraCGMode);
     free(comm->intraCC);
   }
-  CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
+  NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
 
   // Poison comm to try and catch a double free
   commPoison(comm);
@@ -218,7 +220,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   struct ncclComm* comm;
   NCCLCHECK(ncclCalloc(&comm, 1));
 
-  comm->rank = comm->hostDevComm.rank =rank;
+  comm->rank = comm->hostDevComm.rank = rank;
   comm->nRanks = comm->hostDevComm.nRanks = ndev;
   cudaGetDevice(&comm->cudaDev);
   NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
@@ -240,11 +242,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 
   comm->argsptr = &comm->args;
   comm->collNetSupport = 0;
-  comm->p2plist.count=0;
-  NCCLCHECK(ncclCalloc(&comm->p2plist.peerlist, comm->nRanks));
-  for (int r=0; r<comm->nRanks; r++) comm->p2plist.peerlist[r].sendbytes = comm->p2plist.peerlist[r].recvbytes = -1;
-  NCCLCHECK(ncclCalloc(&comm->p2plist.connect.recv, MAXCHANNELS*comm->nRanks));
-  NCCLCHECK(ncclCalloc(&comm->p2plist.connect.send, MAXCHANNELS*comm->nRanks));
+
+  NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS));
+  comm->asyncOpCount = 0;
+  comm->asyncTotalSize = 0;
+
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
+  NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
+
+  comm->p2pSendCount = comm->p2pRecvCount = 0;
+  NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));
 
   // Mark channels as non initialized.
   for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
@@ -396,8 +406,8 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
 
 #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
 #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
-#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
-#define DEFAULT_BUFFSIZE_ARM (1LL << 20) /* 1MiB */
+#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
+#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */
 NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
 NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
 NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
@@ -455,7 +465,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
   // setup
   struct ncclConnect myConnect;
   if (isMaster && ret > 0) {
-    NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
+    NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
   }
   // prepare connect handles
   ncclResult_t res;
@@ -485,7 +495,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
   }
   // connect
   if (isMaster && ret > 0) {
-    NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
+    NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
     struct ncclPeer* devRoot = channel->devPeers+nranks;
     struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
     CUDACHECKGOTO(cudaMemcpy(devConn, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice), res, cleanup);
@@ -543,10 +553,9 @@ NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
 NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
 
 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
-  // We use 3 AllGathers
-  // 1. { peerInfo, comm }
-  // 2. ConnectTransport[nranks], ConnectValue[nranks]
-  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+  // We use 2 AllGathers
+  // 1. { peerInfo, comm, compCap}
+  // 2. { nChannels, graphInfo, topoRanks }
 
   int rank = comm->rank;
   int nranks = comm->nRanks;
@@ -558,10 +567,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   struct {
     struct ncclPeerInfo peerInfo;
     struct ncclComm* comm;
+    int cudaCompCap;
   } *allGather1Data;
 
   NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
   allGather1Data[rank].comm = comm;
+  allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
   struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
   NCCLCHECK(fillInfo(comm, myInfo, commHash));
   NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
@@ -574,7 +585,42 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       return ncclInvalidUsage;
     }
   }
-  // AllGather1 data is used again below
+
+  // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
+  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+  int myCompCap = allGather1Data[rank].cudaCompCap;
+  int minCompCap = myCompCap, maxCompCap = myCompCap;
+  uint64_t otherHostHash;
+  int tmpNnodes = 1;
+  for (int i = 0; i < nranks; i++) {
+    if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
+      if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
+        if (intraRanks == 0) intraRank0 = i;
+        if (i == rank) intraRank = intraRanks;
+        intraRanks++;
+      }
+    } else {  // Determine whether number of nodes is 2 (for use in tree pattern determination)
+      if (tmpNnodes == 1) {
+        otherHostHash = allGather1Data[i].peerInfo.hostHash;
+        tmpNnodes = 2;
+      } else if (tmpNnodes == 2 && otherHostHash != allGather1Data[i].peerInfo.hostHash) {
+        tmpNnodes = 3;
+      }
+    }
+    minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
+    maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
+  }
+  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
+    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+         rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+    return ncclInternalError;
+  }
+  struct ncclComm* intraRank0Comm = allGather1Data[intraRank0].comm;
+
+  free(allGather1Data);
+
   // AllGather1 - end
 
   // Topo detection / System graph creation
@@ -603,7 +649,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   struct ncclTopoGraph treeGraph;
   treeGraph.id = 1;
-  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+  treeGraph.pattern = tmpNnodes <= 2 ? NCCL_TOPO_PATTERN_TREE : NCCL_TOPO_PATTERN_BALANCED_TREE;
   treeGraph.crossNic = ncclParamCrossNic();
   treeGraph.collNet = 0;
   treeGraph.minChannels = 1;
@@ -627,15 +673,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   // AllGather3 - begin
   struct ncclGraphInfo {
+    int pattern;
     int sameChannels;
     float speedIntra;
     float speedInter;
     int typeIntra;
+    int typeInter;
   };
 
   struct {
     int cudaCompCap;
-    int fullCudaCompCap;
     int nChannels;
     struct ncclGraphInfo tree;
     struct ncclGraphInfo ring;
@@ -644,29 +691,35 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   } *allGather3Data;
 
   NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
-  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
   allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
     std::min(treeGraph.nChannels, ringGraph.nChannels);
+  allGather3Data[rank].tree.pattern = treeGraph.pattern;
   allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
   allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
   allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
   allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
+  allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
+  allGather3Data[rank].ring.pattern = ringGraph.pattern;
   allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
   allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
   allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
   allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
+  allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
+  allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
   allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
   allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
   allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
   allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
+  allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
 
   NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
 
   NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
 
   // Determine nNodes, firstRanks, ...
-  int* nodesFirstRank;
+  int *nodesFirstRank, *nodesTreePatterns;
   NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
+  NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
   for (int i=0; i<nranks; i++) {
     int node = -1;
     int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
@@ -676,18 +729,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     if (node == -1) {
       node = comm->nNodes++;
       nodesFirstRank[node] = firstRank;
+      // Record tree pattern of each node as they can be different depending on sm arch
+      nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
     }
     if (i == comm->rank) comm->node = node;
   }
 
-  // Determine the minimum CUDA Compute capability of all GPUs
-  int myCompCap = allGather3Data[rank].cudaCompCap;
-  int minCompCap = myCompCap, maxCompCap = myCompCap;
-  for (int i = 0; i < nranks; i++) {
-    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
-    maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
-  }
-
   int nChannelsOrig = comm->nChannels;
   struct ncclTopoRanks** allTopoRanks;
   NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
@@ -699,14 +746,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
     treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
     treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
+    treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
     ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
     ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
     ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
     ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+    ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
     collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
     collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
     collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
     collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
+    collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
   }
 
   if (comm->nChannels < nChannelsOrig) {
@@ -718,7 +768,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   int *rings;
   NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
 
-  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings));
   if (comm->nNodes > 1 &&
       ncclParamCollNetEnable() == 1 &&
       collNetSupport() && collNetGraph.nChannels) {
@@ -726,6 +776,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   }
 
   free(allTopoRanks);
+  free(nodesTreePatterns);
   free(nodesFirstRank);
   free(allGather3Data);
 
@@ -733,16 +784,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
 
-  NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
-
   char line[1024];
   line[0]='\0';
   for (int c=0; c<comm->nChannels; c++) {
-    struct ncclTree* treeUp = &comm->channels[c].treeUp;
-    struct ncclTree* treeDn = &comm->channels[c].treeDn;
-    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
-        c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
-        treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
+    struct ncclTree* tree = &comm->channels[c].tree;
+    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+        c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
   }
   line[1023] = '\0';
   INFO(NCCL_INIT, "Trees%s", line);
@@ -757,16 +804,24 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECK(computeBuffSizes(comm));
 
   // Connect with prev/next for each ring
-  struct ncclConnect *connect;
-  NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
   for (int c=0; c<comm->nChannels; c++) {
     struct ncclChannel* channel = comm->channels+c;
     NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
     if (comm->nRanks == 1) continue;
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
   }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore);
+  INFO(NCCL_INIT, "Connected all rings");
+
+  // Connect Trees
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    if (comm->nRanks == 1) continue;
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore);
+  }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore);
+  INFO(NCCL_INIT, "Connected all trees");
 
   // Check if we can setup CollNet
   if (comm->nNodes > 1 &&
@@ -779,8 +834,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     for (int c=0; c<logicChannels; c++) {
       struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
       struct ncclChannel* channelSend = comm->channels+c;
-      NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
-      NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
+      NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down));
+      NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up));
       const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
       const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
       if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
@@ -788,39 +843,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
         collNetSetupFail = 1;
     }
+    NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph));
     // Verify CollNet setup across ranks
     NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
   }
   TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
-  free(connect);
   free(rings);
 
+  // Compute time models for algorithm and protocol combinations
+  NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 
-  // Compute intra ranks (using AllGather1 data)
-  do {
-    int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-    for (int i = 0; i < nranks; i++) {
-      if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
-          (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
-        if (intraRanks == 0) intraRank0 = i;
-        if (i == rank) intraRank = intraRanks;
-        intraRanks++;
-      }
-    }
-    TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-    if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
-      WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-          rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-      return ncclInternalError;
-    }
-    NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
-  } while(0);
-
-  // Done with AllGather1 data
-  free(allGather1Data);
+  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
 
   if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
 
@@ -884,6 +920,7 @@ end:
 
 NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
 ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   int cudaDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
   NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev));
@@ -892,6 +929,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
 
 NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
 ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
   if (ndev < 0) {
     WARN("Invalid device count requested : %d", ndev);
@@ -911,9 +949,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
 
 static ncclResult_t commDestroy(ncclComm_t comm) {
   int savedDevice;
-#ifdef ENABLE_TRACE
-  int rank = comm->rank;
-#endif
   CUDACHECK(cudaGetDevice(&savedDevice));
   int commDevice = comm->cudaDev;
 
@@ -921,7 +956,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
     CUDACHECK(cudaSetDevice(commDevice));
   }
 
-  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError);
 
   CUDACHECK(cudaStreamSynchronize(comm->groupStream));
   NCCLCHECK(ncclProxyDestroy(comm));
@@ -930,13 +965,14 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
   if (savedDevice != commDevice)
     CUDACHECK(cudaSetDevice(savedDevice));
 
-  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
 
   return ncclSuccess;
 }
 
 NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
 ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   if (comm == NULL)
     return ncclSuccess;
 
@@ -953,6 +989,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
 
 NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
 ncclResult_t ncclCommAbort(ncclComm_t comm) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   if (comm == NULL)
     return ncclSuccess;
 
@@ -985,6 +1022,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
 
 NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
   NCCLCHECK(PtrCheck(count, "CommCount", "count"));
   *count = comm->nRanks;
@@ -993,6 +1031,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
 
 NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
 ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
   NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
   *devid = comm->cudaDev;
@@ -1001,6 +1040,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
 
 NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
 ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
   NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
   *rank = comm->rank;
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index 27623b2533..c262f8cd5a 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -45,11 +45,11 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
   }
   // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
   info->nBytes = info->count * ncclTypeSize(info->datatype);
-  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
     info->count = info->nBytes;
     info->datatype = ncclInt8;
   }
-  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
 
   if (info->op < 0 || info->op >= ncclNumOps) {
     WARN("%s : invalid reduction operation %d", info->opName, info->op);
@@ -57,7 +57,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
   }
 
   if (info->comm->checkPointers) {
-    if (info->coll == ncclCollSendRecv) {
+    if (info->coll == ncclFuncSendRecv) {
       if (strcmp(info->opName, "Send") == 0) {
         NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
       } else {
@@ -65,10 +65,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
       }
     } else {
       // Check CUDA device pointers
-      if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+      if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
         NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
       }
-      if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+      if (info->coll != ncclFuncReduce || info->comm->rank == info->root) {
         NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
       }
     }
diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc
index 34ed0aa8c7..e83392dcd5 100644
--- a/src/misc/nvmlwrap.cc
+++ b/src/misc/nvmlwrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,14 +16,11 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
 static nvmlReturn_t (*nvmlInternalShutdown)(void);
 static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
 static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
 static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
     nvmlNvLinkCapability_t capability, unsigned int *capResult);
-static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
 static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
 
 // Used to make the NVML library calls thread safe
@@ -74,10 +71,7 @@ ncclResult_t wrapNvmlSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -91,9 +85,6 @@ teardown:
   nvmlInternalShutdown = NULL;
   nvmlInternalDeviceGetHandleByPciBusId = NULL;
   nvmlInternalDeviceGetIndex = NULL;
-  nvmlInternalDeviceGetHandleByIndex = NULL;
-  nvmlInternalDeviceGetPciInfo = NULL;
-  nvmlInternalDeviceGetMinorNumber = NULL;
   nvmlInternalDeviceGetNvLinkState = NULL;
   nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
   nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -162,51 +153,6 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
-  if (nvmlInternalDeviceGetHandleByIndex == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetHandleByIndex() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
-  if (nvmlInternalDeviceGetPciInfo == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetPciInfo() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
-  if (nvmlInternalDeviceGetMinorNumber == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetMinorNumber() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
   if (nvmlInternalDeviceGetNvLinkState == NULL) {
     /* Do not warn, this symbol is optional. */
diff --git a/src/proxy.cc b/src/proxy.cc
index 19dbcedc85..d3824f2346 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -6,10 +6,10 @@
 
 #include "comm.h"
 #include "info.h"
+#include "graph.h"
 #include "collectives.h"
 
-#define RECV 0
-#define SEND 1
+enum { proxyRecv=0, proxySend=1 };
 
 static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
   if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -19,15 +19,13 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
   const int myrank = 0, nextrank = 1, prevrank = nranks-1;
   int index = pattern == ncclPatternPipelineFrom ?
       /*                            no recv /  no send    if root = */
-      /* bcast  */ (type == RECV ?   myrank : nextrank ):
-      /* reduce */ (type == RECV ? prevrank :   myrank );
+      /* bcast  */ (type == proxyRecv ?   myrank : nextrank ):
+      /* reduce */ (type == proxyRecv ? prevrank :   myrank );
   int rank = ring->userRanks[index];
   return (root != rank);
 }
 
-enum { proxyRecv=0, proxySend=1 };
-
-#define PROXYARGS_ALLOCATE_SIZE 32
+#define PROXYARGS_ALLOCATE_SIZE 128
 struct ncclProxyPool {
   struct ncclProxyPool *next;
   struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
@@ -36,7 +34,7 @@ struct ncclProxyPool {
 static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
   struct ncclProxyState* state = &comm->proxyState;
   struct ncclProxyArgs* elem;
-  pthread_mutex_lock(&state->mutex);
+  pthread_mutex_lock(&state->poolMutex);
   if (state->pool == NULL) {
     // Allocate a new pool of elements
     struct ncclProxyPool* newPool;
@@ -54,39 +52,113 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a
   }
   elem = state->pool;
   state->pool = state->pool->next;
-  pthread_mutex_unlock(&state->mutex);
-  elem->next = elem->nextPeer = NULL;
+  pthread_mutex_unlock(&state->poolMutex);
+  elem->next = elem->nextPeer = elem->nextGroup = NULL;
   *argsptr = elem;
   return ncclSuccess;
 }
 
-static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
-  struct ncclComm* comm = connector->comm;
-  struct ncclProxyState* state = &comm->proxyState;
-  pthread_mutex_lock(&state->mutex);
-  if (connector->proxyAppend == NULL) {
-    // Nothing running for that peer. Add to the circular list
-    if (state->ops == NULL) {
-      // Create the list
-      args->next = args;
-      state->ops = args;
-    } else {
-      // Insert element in the list
-      args->next = state->ops->next;
-      state->ops->next = args;
+//#define DEBUG_PROXY 1
+#ifdef DEBUG_PROXY
+#define DEBUG_PROXY_PRINT printf
+#else
+#define DEBUG_PROXY_PRINT(...)
+#endif
+
+#define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1)
+#define OP_SEEN 0x100000
+ncclResult_t dumpProxyState(struct ncclProxyState* state) {
+#ifdef DEBUG_PROXY
+  struct ncclProxyArgs* op = state->ops;
+  while (op) {
+    if (op->idle & OP_SEEN) {
+      WARN("Active list loop at element %ld\n", OP_INDEX(op));
+    }
+    op->idle |= OP_SEEN;
+    printf("[%ld]", OP_INDEX(op));
+    if (op->nextPeer) {
+      printf("(%ld)", OP_INDEX(op->nextPeer));
+      struct ncclProxyArgs* n = op->nextPeer;
+      n->idle |= OP_SEEN;
+      while (n->nextGroup || n->nextPeer) {
+        n = n->nextGroup ? n->nextGroup : n->nextPeer;
+        n->idle |= OP_SEEN;
+      }
+    }
+    if (op->nextGroup)  {
+      printf("--G->");
+      op = op->nextGroup;
+    } else {
+      printf("--N->");
+      op = op->next;
     }
-    connector->proxyAppend = args;
-  } else {
-    // There is an active operation already for that peer.
-    // Add it to the per-peer list
-    connector->proxyAppend->nextPeer = args;
-    connector->proxyAppend = args;
   }
-  pthread_mutex_unlock(&state->mutex);
+  printf("[X]\n");
+
+  struct ncclProxyArgs* free = state->pool;
+  while (free) {
+    if (free->idle & OP_SEEN) {
+      WARN("Free list loop at element %ld\n", OP_INDEX(free));
+    }
+    free->idle |= OP_SEEN;
+    free = free->next;
+  }
+
+  struct ncclProxyPool* p = state->pools;
+  int i = 0;
+  while (p) {
+    for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
+      if ((p->elems[e].idle & OP_SEEN) == 0) {
+        WARN("Element %d of pool %d has been lost\n", e, i);
+        struct ncclProxyArgs* free = state->pool;
+        printf("Free list ");
+        while (free) {
+          printf("--> %ld ", OP_INDEX(free));
+          free = free->next;
+        }
+        printf("\n");
+        return ncclInternalError;
+      }
+      p->elems[e].idle -= OP_SEEN;
+    }
+    p = p->next;
+    i++;
+  }
+#endif
+  return ncclSuccess;
 }
 
-template <int type>
-static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args, int shared) {
+  struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr;
+  if (proxyAppend) {
+    if (shared && proxyAppend->opCount == args->opCount) {
+      args->next = proxyAppend->next;
+      proxyAppend->next = NULL;
+      proxyAppend->nextGroup = args;
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld/%5ld) as group, prevGroup %5ld, next %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend), OP_INDEX(args->next));
+    } else {
+      proxyAppend->nextPeer = args;
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld/%5ld) as nextPeer of %5ld                  : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+    }
+  } else {
+    // Nothing running for that peer. Add to the list
+    if (state->ops == NULL) {
+      // Create the list
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld) as first element                            : \n", OP_INDEX(args), shared, args->opCount);
+      state->ops = args;
+    } else {
+      // Append element at the end of the list
+      struct ncclProxyArgs* last = state->ops;
+      while (last->nextGroup || last->next) last = last->nextGroup ? last->nextGroup : last->next;
+      last->next = args;
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld) as last element                             : \n", OP_INDEX(args),shared, args->opCount);
+    }
+  }
+  *(args->proxyAppendPtr) = args;
+  return ncclSuccess;
+}
+
+static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
   if (peer < 0) return ncclSuccess;
 
   struct ncclPeer* peerComm = args->channel->peers+peer;
@@ -98,69 +170,169 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
   }
   if (connector->transportComm->proxy == NULL) return ncclSuccess;
 
+  struct ncclProxyState* state = &connector->comm->proxyState;
   struct ncclProxyArgs* op;
   NCCLCHECK(allocateArgs(connector->comm, &op));
   memcpy(op, args, sizeof(struct ncclProxyArgs));
   op->connector = connector;
   op->progress = connector->transportComm->proxy;
   op->state = ncclProxyOpReady;
-  ProxyAppend(connector, op);
+
+  op->proxyAppendPtr =
+    connector->conn.shared ?
+    state->sharedBuffs->proxyAppend+2*args->channel->id+type : // Shared buffers
+    &connector->proxyAppend;  // Dedicated buffers
+
+  if (state->nextOps == NULL) state->nextOps = op;
+  else state->nextOpsEnd->next = op;
+  state->nextOpsEnd = op;
   return ncclSuccess;
 }
 
 ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
   if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
     struct ncclRing* ring = &args->channel->ring;
-    if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
-    if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+    if (NeedProxy(proxyRecv, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args));
+    if (NeedProxy(proxySend, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args));
   }
   if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
     // Tree up
-    struct ncclTree* tree = &args->channel->treeUp;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
-    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args));
+    NCCLCHECK(SaveProxy(proxySend, tree->up, args));
   }
   if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
     // Tree down
-    struct ncclTree* tree = &args->channel->treeDn;
-    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
-    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args));
+    NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
   }
   if (pattern == ncclPatternCollTreeUp) {
     // CollTree up
-    struct ncclTree* tree = &args->channel->collTreeUp;
-    NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
-    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+    struct ncclTree* tree = &args->channel->collTree;
+    NCCLCHECK(SaveProxy(proxyRecv, tree->down[0], args));
+    NCCLCHECK(SaveProxy(proxySend, tree->up, args));
   }
   if (pattern == ncclPatternCollTreeDown) {
     // CollTree down
-    struct ncclTree* tree = &args->channel->collTreeDn;
-    NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
-    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+    struct ncclTree* tree = &args->channel->collTree;
+    NCCLCHECK(SaveProxy(proxySend, tree->down[0], args));
+    NCCLCHECK(SaveProxy(proxyRecv, tree->up, args));
   }
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel) {
+ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment) {
   struct ncclProxyArgs args;
   memset(&args, 0, sizeof(struct ncclProxyArgs));
   args.channel = channel;
   args.sliceSteps = 1;
   args.chunkSteps = 1;
   args.protocol = NCCL_PROTO_SIMPLE;
-  args.opCount = info->comm->opCount;
+  args.segment = segment;
+  args.opCount = channel->workFifoTail-1;
   args.dtype = info->datatype;
-  if (info->delta > 0 && info->sendbytes >= 0) {
-    int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
-    args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
-    if (args.nsteps == 0) args.nsteps = 1;
-    NCCLCHECK(SaveProxy<proxySend>(peersend, &args));
-  }
   if (info->delta > 0 && info->recvbytes >= 0) {
     int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
     args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
     if (args.nsteps == 0) args.nsteps = 1;
-    NCCLCHECK(SaveProxy<proxyRecv>(peerrecv, &args));
+    args.recvbytes = info->recvbytes;
+    args.sendbytes = 0;
+    NCCLCHECK(SaveProxy(proxyRecv, peerrecv, &args));
+  }
+  if (info->delta > 0 && info->sendbytes >= 0) {
+    int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
+    args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
+    if (args.nsteps == 0) args.nsteps = 1;
+    args.sendbytes = info->sendbytes;
+    args.recvbytes = 0;
+    NCCLCHECK(SaveProxy(proxySend, peersend, &args));
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr, struct ncclProxyArgs** prevGroupPtr) {
+  struct ncclProxyArgs* freeOp = *opPtr;
+  DEBUG_PROXY_PRINT("Remove %ld/%ld -> %ld -> %ld/%ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(*prevGroupPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next), OP_INDEX(freeOp->nextGroup));
+  if (*prevGroupPtr && *prevOpPtr) return ncclInternalError;
+  if (freeOp->nextGroup) {
+    // Part of a group : remove the element
+    struct ncclProxyArgs* next = freeOp->nextGroup;
+    *opPtr = next;
+    if (*prevGroupPtr) {
+      (*prevGroupPtr)->nextGroup = next;
+    } else if (*prevOpPtr) {
+      (*prevOpPtr)->next = next;
+    } else {
+      state->ops = next;
+    }
+  } else {
+    struct ncclProxyArgs* next = freeOp->next;
+    *opPtr = next;
+    if ((*prevGroupPtr)) {
+      (*prevGroupPtr)->next = next;
+      (*prevGroupPtr)->nextGroup = NULL;
+      (*prevGroupPtr)->nextPeer = freeOp->nextPeer;
+      if (*(freeOp->proxyAppendPtr) == freeOp) *(freeOp->proxyAppendPtr) = *prevGroupPtr;
+      (*prevOpPtr) = *prevGroupPtr;
+      (*prevGroupPtr) = NULL;
+    } else {
+      if (freeOp->nextPeer) {
+        // replace op by nextPeer
+        struct ncclProxyArgs* nextPeer = freeOp->nextPeer;
+        if (*prevOpPtr) {
+          (*prevOpPtr)->next = nextPeer;
+        } else {
+          state->ops = nextPeer;
+        }
+        struct ncclProxyArgs* lastGroup = nextPeer;
+        while (lastGroup->nextGroup) lastGroup = lastGroup->nextGroup;
+        lastGroup->next = next;
+        *(prevOpPtr) = lastGroup;
+      } else {
+        *(freeOp->proxyAppendPtr) = NULL;
+        if (*prevOpPtr) {
+          (*prevOpPtr)->next = next;
+        } else {
+          state->ops = next;
+        }
+      }
+    }
+  }
+  pthread_mutex_lock(&state->poolMutex);
+  freeOp->next = state->pool;
+  state->pool = freeOp;
+  pthread_mutex_unlock(&state->poolMutex);
+  DEBUG_PROXY_PRINT("Removed %5ld (%5ld)                                               : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+  NCCLCHECK(dumpProxyState(state));
+  return ncclSuccess;
+}
+
+static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
+  struct ncclProxyArgs* prevOp = NULL;
+  struct ncclProxyArgs* prevGroup = NULL;
+  struct ncclProxyArgs* op = *opsPtr;
+  while (op) {
+    if (op->state == ncclProxyOpNone) return ncclInternalError;
+    // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
+    // yet and might be cancelled before they even start. Hold on on those.
+    if (op->opCount < comm->lastOpCount) {
+      NCCLCHECK(op->progress(op));
+      *idle &= op->idle;
+    }
+    if (op->state == ncclProxyOpNone) {
+      NCCLCHECK(removeOp(state, &op, &prevOp, &prevGroup));
+    } else {
+      if (op->nextGroup) {
+        prevGroup = op;
+        prevOp = NULL;
+        op = op->nextGroup;
+      } else {
+        prevOp = op;
+        prevGroup = NULL;
+        op = op->next;
+      }
+    }
   }
   return ncclSuccess;
 }
@@ -168,91 +340,170 @@ ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel
 void* persistentThread(void *comm_) {
   struct ncclComm* comm = (struct ncclComm*)comm_;
   struct ncclProxyState* state = &comm->proxyState;
-  struct ncclProxyArgs* op = NULL;
-  ncclResult_t ret = ncclSuccess;
-  int idle = 1;
-  int idleSpin = 0;
+  char threadName[16];
+  sprintf(threadName, "NCCLproxy %5d", comm->rank);
+  nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
+
+  pthread_mutex_lock(&state->opsMutex);
+  struct ncclProxyArgs** opsPtr = &state->ops;
   while (1) {
-    do {
-      if (*comm->abortFlag) return NULL;
-      if (op == NULL) {
-        pthread_mutex_lock(&state->mutex);
-        op = state->ops;
-        if (op == NULL) {
-          if (state->stop) {
-            // No more commands to process and proxy has been requested to stop
-            pthread_mutex_unlock(&state->mutex);
-            return NULL;
-          }
-          pthread_cond_wait(&state->cond, &state->mutex);
-        }
-        pthread_mutex_unlock(&state->mutex);
+    if (*comm->abortFlag) {
+      pthread_mutex_unlock(&state->opsMutex);
+      return NULL;
+    }
+
+    while (*opsPtr == NULL) {
+      if (state->stop) {
+        // No more commands to process and proxy has been requested to stop
+        pthread_mutex_unlock(&state->opsMutex);
+        return NULL;
       }
-    } while (op == NULL);
-    op->idle = 0;
-    // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
-    // yet and might be cancelled before they even start. Hold on on those.
-    if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
+      pthread_cond_wait(&state->cond, &state->opsMutex);
+    }
+    int idle = 1;
+    ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
     if (ret != ncclSuccess) {
       comm->fatalError = ret;
       INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+      pthread_mutex_unlock(&state->opsMutex);
       return NULL;
     }
-    idle &= op->idle;
-    pthread_mutex_lock(&state->mutex);
-    if (!idle) idleSpin = 0;
-    struct ncclProxyArgs *next = op->next;
-    if (next->state == ncclProxyOpNone) {
-      struct ncclProxyArgs *freeOp = next;
-      if (next->nextPeer) {
-        // Replace next by its next per-peer element.
-        next = next->nextPeer;
-        if (op != freeOp) {
-          next->next = freeOp->next;
-          op->next = next;
-        } else {
-          next->next = next;
-        }
-      } else {
-        // Remove next from circular list
-        next->connector->proxyAppend = NULL;
-        if (op != freeOp) {
-          next = next->next;
-          op->next = next;
-        } else {
-          next = NULL;
-        }
-      }
-      if (freeOp == state->ops) state->ops = next;
-      freeOp->next = state->pool;
-      state->pool = freeOp;
+    if (idle) {
+      pthread_mutex_unlock(&state->opsMutex);
+      sched_yield(); // No request progressed. Let others run.
+      pthread_mutex_lock(&state->opsMutex);
     }
-    op = next;
-    if (op == state->ops) {
-      if (idle == 1) {
-        if (++idleSpin == 10) {
-          sched_yield();
-          idleSpin = 0;
-        }
-      }
-      idle = 1;
-    }
-    pthread_mutex_unlock(&state->mutex);
   }
 }
 
 ncclResult_t ncclProxyStart(struct ncclComm* comm) {
-  pthread_mutex_lock(&comm->proxyState.mutex);
-  if (comm->proxyState.ops != NULL)
-    pthread_cond_signal(&comm->proxyState.cond);
-  pthread_mutex_unlock(&comm->proxyState.mutex);
+  struct ncclProxyState* state = &comm->proxyState;
+  pthread_mutex_lock(&state->opsMutex);
+
+  // Sort operations as we append them : collectives and
+  // receives first, then sends.
+  ncclProxyArgs* next, *prev = NULL, *op = state->nextOps;
+  while (op) {
+    next = op->next;
+    if (op->sendbytes) {
+      if (prev) prev->next = next;
+      else state->nextOps = next;
+      op->next = NULL;
+      NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
+    } else prev = op;
+    op = next;
+  }
+  op = state->nextOps;
+  while (op) {
+    next = op->next;
+    op->next = NULL;
+    NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared));
+    op = next;
+  }
+  state->nextOps = state->nextOpsEnd = NULL;
+  NCCLCHECK(dumpProxyState(state));
+
+  if (state->ops != NULL)
+    pthread_cond_signal(&state->cond);
+  pthread_mutex_unlock(&state->opsMutex);
+  return ncclSuccess;
+}
+
+NCCL_PARAM(ProxySharedBuffersCount, "SHARED_BUFF_COUNT", -2);
+
+ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  if (state == NULL) {
+    NCCLCHECK(ncclCalloc(&state, 1));
+    comm->proxyState.sharedBuffs = state;
+    state->nslots = ncclParamProxySharedBuffersCount();
+    if (state->nslots == -2)  {
+      state->nslots = NCCL_STEPS*NCCL_MAX_WORK_ELEMENTS;
+    }
+    state->slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
+  }
+
+  char* buff;
+  int* used;
+  *size = 2*comm->p2pnChannels*state->slotSize*state->nslots;
+
+  if (cuda && state->cudaBuff[0] == NULL) {
+    NCCLCHECK(ncclCudaCalloc(&buff, *size));
+    NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
+    for (int i=0; i<2*comm->p2pnChannels; i++) {
+      state->cudaBuff[i] = buff + state->nslots*state->slotSize*i;
+      state->cudaUsed[i] = used + state->nslots*i;
+    }
+  } else if (state->hostBuff[0] == NULL) {
+    NCCLCHECK(ncclCudaHostCalloc(&buff, *size));
+    NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
+    for (int i=0; i<2*comm->p2pnChannels; i++) {
+      state->hostBuff[i] = buff + state->nslots*state->slotSize*i;
+      state->hostUsed[i] = used + state->nslots*i;
+    }
+  }
+  buff = cuda ? state->cudaBuff[0] : state->hostBuff[0];
+
+  *ptr = buff;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  // Use different pools for different channels and also separate send/recv.
+  int p = 2*channel+type;
+  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
+  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
+  if (buff == NULL) return ncclInternalError;
+  int nslots = 1;
+  while (nslots*state->slotSize < size) nslots *= 2;
+  for (int s=0; s<state->nslots; s+=nslots) {
+    int u = 0;
+    for (int i=0; i<nslots; i++) u += used[s+i];
+    if (u == 0) {
+      for (int i=0; i<nslots; i++) used[s+i] = 1;
+      *ptr = buff+state->slotSize*s;
+      return ncclSuccess;
+    }
+  }
+  *ptr = NULL;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  int p = 2*channel+type;
+  int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p];
+  char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p];
+  if (buff == NULL) return ncclInternalError;
+  int nslots = 1;
+  while (nslots*state->slotSize < size) nslots *= 2;
+  int s = (ptr-buff)/state->slotSize;
+  if (s < 0 || s+nslots > state->nslots) {
+    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
+    return ncclInternalError;
+  }
+  for (int i=0; i<nslots; i++) used[s+i] = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
+  struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs;
+  if (state) {
+    CUDACHECK(cudaFree(state->cudaBuff[0]));
+    free(state->cudaUsed[0]);
+    NCCLCHECK(ncclCudaHostFree(state->hostBuff[0]));
+    free(state->hostUsed[0]);
+    free(state);
+  }
   return ncclSuccess;
 }
 
 ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
   if (!comm->proxyThread) {
     comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
-    comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+    comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER;
+    comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER;
     comm->proxyState.ops = NULL;
     pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
   }
@@ -263,21 +514,23 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
   struct ncclProxyState* state = &comm->proxyState;
 
   // Request the proxy to stop and then wake it
-  pthread_mutex_lock(&state->mutex);
+  pthread_mutex_lock(&state->opsMutex);
   state->stop = true;
   pthread_cond_signal(&state->cond);
-  pthread_mutex_unlock(&state->mutex);
+  pthread_mutex_unlock(&state->opsMutex);
   if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
 
   // Free off any memory allocated for the proxy arg pools
-  pthread_mutex_lock(&state->mutex);
+  pthread_mutex_lock(&state->poolMutex);
   struct ncclProxyState* proxyState = &comm->proxyState;
   while (proxyState->pools != NULL) {
     struct ncclProxyPool *next = proxyState->pools->next;
     free(proxyState->pools);
     proxyState->pools = next;
   }
-  pthread_mutex_unlock(&state->mutex);
+  pthread_mutex_unlock(&state->poolMutex);
+
+  NCCLCHECK(ncclProxySharedBuffersDestroy(comm));
 
   return ncclSuccess;
 }
diff --git a/src/transport.cc b/src/transport.cc
index 7219ea3798..a5af54117a 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -19,15 +19,15 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
 };
 
 template <int type>
-static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
+static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
   for (int t=0; t<NTRANSPORTS; t++) {
     struct ncclTransport *transport = ncclTransports+t;
     struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
     int ret = 0;
-    NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
+    NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
     if (ret) {
       connector->transportComm = transportComm;
-      NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId));
+      NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId));
       return ncclSuccess;
     }
   }
@@ -35,53 +35,87 @@ static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopo
   return ncclInternalError;
 }
 
-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
   TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
-  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
-  struct ncclConnect connect;
-  struct ncclConnector* conn;
+  uint32_t mask = 1 << channel->id;
   for (int i=0; i<nrecv; i++) {
     int peer = peerRecv[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].recv;
-    if (conn->connected) { ++nSkippedRecv; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
-    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue;
+    comm->connectRecv[peer] |= mask;
   }
   for (int i=0; i<nsend; i++) {
     int peer = peerSend[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].send;
-    if (conn->connected) { ++nSkippedSend; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
-    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue;
+    comm->connectSend[peer] |= mask;
   }
-  for (int i=0; i<nsend; i++) {
-    int peer = peerSend[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].send;
-    if (conn->connected) {++nSkippedSend; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
-    conn->connected = 1;
-    CUDACHECK(cudaMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice));
-  }
-  for (int i=0; i<nrecv; i++) {
-    int peer = peerRecv[i];
-    if (peer == -1 || peer >= comm->nRanks) continue;
-    conn = &channel->peers[peer].recv;
-    if (conn->connected) {++nSkippedRecv; continue; }
-    memset(&connect, 0, sizeof(connect));
-    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
-    conn->connected = 1;
-    CUDACHECK(cudaMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice));
-  }
-  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
   return ncclSuccess;
 }
 
+void dumpData(struct ncclConnect* data, int ndata) {
+  for (int n=0; n<ndata; n++) {
+    printf("[%d] ", n);
+    uint8_t* d = (uint8_t*)data;
+    for (int i=0; i<sizeof(struct ncclConnect); i++) printf("%02x", d[i]);
+    printf("\n");
+  }
+}
+
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
+  struct ncclConnect data[2*MAXCHANNELS];
+  for (int i=1; i<comm->nRanks; i++) {
+    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+    int sendPeer = (comm->rank + i) % comm->nRanks;
+    uint32_t recvMask = comm->connectRecv[recvPeer];
+    uint32_t sendMask = comm->connectSend[sendPeer];
+
+    struct ncclConnect* recvData = data;
+    int sendChannels = 0, recvChannels = 0;
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (recvMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
+        NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
+      }
+    }
+    struct ncclConnect* sendData = recvData+recvChannels;
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (sendMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
+        NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
+      }
+    }
+
+    if (sendPeer == recvPeer) {
+      if (recvChannels+sendChannels) {
+         NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+         NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+         sendData = data;
+         recvData = data+sendChannels;
+      }
+    } else {
+      if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
+      if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
+      if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels));
+      if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels));
+    }
+
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (sendMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
+        NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
+        conn->connected = 1;
+        CUDACHECK(cudaMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice));
+      }
+    }
+    for (int c=0; c<MAXCHANNELS; c++) {
+      if (recvMask & (1<<c)) {
+        struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
+        NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
+        conn->connected = 1;
+        CUDACHECK(cudaMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice));
+      }
+    }
+    comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
+  }
+  return ncclSuccess;
+}
 
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 132f4fa291..af865ce80b 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -26,10 +26,8 @@ struct reqSlot {
 
 struct collNetSendResources {
   void* collNetSendComm;
-  struct ncclSendMem* hostSendMem;
-  struct ncclRecvMem* hostRecvMem;
-  struct ncclSendMem* devHostSendMem;
-  struct ncclRecvMem* devHostRecvMem;
+  struct ncclSendMem* sendMem;
+  struct ncclRecvMem* recvMem;
   uint32_t* llData;
   int netDev;
   int useGdr;
@@ -45,10 +43,8 @@ struct collNetSendResources {
 struct collNetRecvResources {
   void* netListenComm;
   void* collNetRecvComm;
-  struct ncclSendMem* hostSendMem;
-  struct ncclRecvMem* hostRecvMem;
-  struct ncclSendMem* devHostSendMem;
-  struct ncclRecvMem* devHostRecvMem;
+  struct ncclSendMem* sendMem;
+  struct ncclRecvMem* recvMem;
   uint32_t* llData;
   int netDev;
   int useGdr;
@@ -67,16 +63,15 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
 }
 
 /* Setup send connector, and return connect information for others in the coll communicator to connect to me */
-ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
   struct collNetSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
 
-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
 
-  NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
-  resources->devHostSendMem = resources->hostSendMem;
+  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
 
   int recvSize = offsetof(struct ncclRecvMem, buff);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p];
@@ -84,8 +79,7 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
   if (resources->useGdr) {
     NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
   }
-  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
-  resources->devHostRecvMem = resources->hostRecvMem;
+  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
   NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2));
 
   INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
@@ -94,16 +88,15 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
 }
 
 /* Setup recv connector */
-ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
+ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
   struct collNetRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
 
-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
 
-  NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
-  resources->devHostSendMem = resources->hostSendMem;
+  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
 
   int recvSize = offsetof(struct ncclRecvMem, buff);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p];
@@ -111,8 +104,7 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
   if (resources->useGdr) {
     NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
   }
-  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
-  resources->devHostRecvMem = resources->hostRecvMem;
+  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
 
   NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2));
 
@@ -123,25 +115,25 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
   return ncclSuccess;
 }
 
-ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
   // Setup device pointers
   struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
   struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
 
   // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
-  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
   int offset = 0;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
+    send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
     offset += send->comm->buffSizes[p];
   }
   send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
 
   // Head/Tail/Opcount/Fifos are always on host
-  send->conn.tail = &resources->devHostRecvMem->tail;
-  send->conn.fifo = resources->devHostRecvMem->sizesFifo;
-  send->conn.head = &resources->devHostSendMem->head;
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+  send->conn.tail = &resources->recvMem->tail;
+  send->conn.sizesFifo = resources->recvMem->sizesFifo;
+  send->conn.head = &resources->sendMem->head;
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
 
   // Get info from recv side
   resources->collNetRank = rank;
@@ -159,24 +151,24 @@ ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, in
   return ncclSuccess;
 }
 
-ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
   // Setup device pointers
   struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
   struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
   resources->collNetRank = rank;
 
   // Intermediate buffering on GPU for GPU Direct RDMA
-  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
   int offset = 0;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
+    recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
     offset += recv->comm->buffSizes[p];
   }
   recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
 
   // Head/Tail/Opcount are always on host
-  recv->conn.tail = &resources->devHostRecvMem->tail;
-  recv->conn.head = &resources->devHostSendMem->head;
+  recv->conn.tail = &resources->recvMem->tail;
+  recv->conn.head = &resources->sendMem->head;
 
   // Connect to coll comm
   collNetHandle_t** handlePtrs = NULL;
@@ -213,8 +205,8 @@ cleanup:
 
 ncclResult_t collNetSendFree(void* sendTransportResources) {
   struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
+  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
   if (resources->collNetSendComm) {
     NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL]));
     NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
@@ -228,12 +220,12 @@ ncclResult_t collNetSendFree(void* sendTransportResources) {
 
 ncclResult_t collNetRecvFree(void* recvTransportResources) {
   struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
   if (resources->collNetRecvComm) {
     NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL]));
     NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
   }
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
   if (resources->useGdr)
     CUDACHECK(cudaFree(resources->devRecvMem));
   free(resources->llData);
@@ -256,96 +248,84 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     // Round to next multiple of sliceSteps
     resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
     args->state = ncclProxyOpProgress;
   }
+  args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
     int p = args->protocol;
     int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
     char* localBuff = args->connector->conn.buffs[p];
     void* sendMhandle = resources->sendMhandles[p];
     void* recvMhandle = resources->recvMhandles[p];
-    args->idle = 1;
     struct reqSlot* reqFifo = resources->reqFifo;
-    if (args->head < args->end) {
-      int buffSlot = args->tail%NCCL_STEPS;
-      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS
-          && reqFifo[buffSlot].recvBuff != NULL) {
-        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
-        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+    int buffSlot = args->transmitted%NCCL_STEPS;
+    if (args->transmitted < args->end && args->transmitted < args->done + NCCL_STEPS
+        && reqFifo[buffSlot].recvBuff != NULL) {
+      volatile int* sizesFifo = resources->recvMem->sizesFifo;
+      volatile uint64_t* recvTail = &resources->recvMem->tail;
+      if (sizesFifo[buffSlot] != -1 && (*recvTail > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
+        // We have something to receive, let's check if it's completely ready.
+        int size = sizesFifo[buffSlot];
+        char* buff = localBuff+buffSlot*stepSize;
+        int ready = 1;
         if (args->protocol == NCCL_PROTO_LL) {
-          int size = sizesFifo[buffSlot];
-          if (size != -1) {
-            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
-            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
-            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-            int ready = 1;
-            for (int i=0; i<nFifoLines; i++) {
-              volatile uint32_t *f1 = &lines[i].flag1;
-              volatile uint32_t *f2 = &lines[i].flag2;
-              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
-            }
-            if (ready) {
-              int stepLines = stepSize / sizeof(union ncclLLFifoLine);
-              //separate data from flag
-              uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines;  // each line has two data elements
-              for (int i=0; i<nFifoLines; i++) {
-                volatile uint32_t *d1 = &lines[i].data1;
-                volatile uint32_t *d2 = &lines[i].data2;
-                sendBuff[2*i] = d1[0];
-                sendBuff[2*i+1] = d2[0];
-              }
-              int count = nFifoLines*2*sizeof(uint32_t) / ncclTypeSize(args->dtype);
-              NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
-              if (args->requests[buffSlot] != NULL) {
-                TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
-                sizesFifo[buffSlot] = -1;
-                // Make sure size is reset to zero before we update the head.
-                __sync_synchronize();
-                args->tail += args->sliceSteps;
-                args->idle = 0;
-              }
-            }
-          }
-        } else if (args->tail < *recvTail) {
-          // Send through network
-          if (sizesFifo[buffSlot] != -1) {
-            int count = sizesFifo[buffSlot]/ncclTypeSize(args->dtype);
-            NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localBuff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
-            if (args->requests[buffSlot] != NULL) {
-              TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
-              sizesFifo[buffSlot] = -1;
-              // Make sure size is reset to zero before we update the head.
-              __sync_synchronize();
-              args->tail += args->sliceSteps;
-              args->idle = 0;
-            }
+          uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
+          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
+          // Pack data into another buffer
+          int stepLines = stepSize / sizeof(union ncclLLFifoLine);
+          uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines;  // each line has two data elements
+          buff = (char*)sendBuff;
+          for (int i=0; i<nFifoLines; i++) {
+            volatile uint32_t *f1 = &lines[i].flag1;
+            volatile uint32_t *d1 = &lines[i].data1;
+            volatile uint32_t *f2 = &lines[i].flag2;
+            volatile uint32_t *d2 = &lines[i].data2;
+            if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+            sendBuff[2*i] = d1[0];
+            sendBuff[2*i+1] = d2[0];
           }
+          size = nFifoLines*2*sizeof(uint32_t);
         }
-      }
-      if (args->head < args->tail) {
-        int done, size;
-        int buffSlot = args->head%NCCL_STEPS;
-        NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
-        if (done) {
-          TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size);
-          reqFifo[buffSlot].size = size;
-          // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
-          // (reordered store after store is possible on POWER, though not on x86)
-          __sync_synchronize();
-          reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
-          args->head += args->sliceSteps;
-          resources->hostSendMem->head = args->head;
-          args->idle = 0;
+        if (ready) {
+          // Data is ready, try to send.
+          int count = size/ncclTypeSize(args->dtype);
+          NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
+          if (args->requests[buffSlot] != NULL) {
+            TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
+            sizesFifo[buffSlot] = -1;
+            // Make sure size is reset to zero before we update the head.
+            __sync_synchronize();
+            args->transmitted += args->sliceSteps;
+            args->idle = 0;
+            return ncclSuccess;
+          }
         }
       }
     }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    // Check whether the network has completed some send operations.
+    if (args->done < args->transmitted) {
+      int done, size;
+      int buffSlot = args->done%NCCL_STEPS;
+      NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
+      if (done) {
+        TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
+        reqFifo[buffSlot].size = size;
+        // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
+        // (reordered store after store is possible on POWER, though not on x86)
+        __sync_synchronize();
+        reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
+        args->done += args->sliceSteps;
+        resources->sendMem->head = args->done;
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+        return ncclSuccess;
+      }
     }
   }
   return ncclSuccess;
@@ -360,56 +340,79 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     // Round to next multiple of sliceSteps
     resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->received = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
     args->state = ncclProxyOpProgress;
   }
+  args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
-    args->idle = 1;
     int p = args->protocol;
     int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
     char* localBuff = args->connector->conn.buffs[p];
     void* mhandle = resources->mhandles[p];
     struct reqSlot* reqFifo = resources->reqFifo;
-    if (args->head < args->end) {
-      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
-        int buffSlot = args->tail%NCCL_STEPS;
-        char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
-        int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
-        reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize;
-        TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, reqFifo[buffSlot].recvBuff);
-        args->tail += args->sliceSteps;
-        args->idle = 0;
-      }
-      if (args->tail > args->head) {
-        int buffSlot = args->head%NCCL_STEPS;
-        if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
-          TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size);
-          args->head += args->sliceSteps;
-          if (args->protocol == NCCL_PROTO_LL) { // ll
-            // re-attach flag
-            uint32_t flag = args->head;
-            int stepLines = stepSize / sizeof(union ncclLLFifoLine);
-            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-            uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
-            int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t));
-            for (int i=0; i<nFifoLines; i++) {
-              lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
-              lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
-            }
-          } else if (args->protocol == NCCL_PROTO_SIMPLE) {
-            if (resources->useGdr) NCCLCHECK(collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle));
-            resources->hostRecvMem->tail = args->head;
+    if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
+      int buffSlot = args->posted%NCCL_STEPS;
+      char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
+      int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
+      reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize;
+      TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
+      args->posted += args->sliceSteps;
+      args->idle = 0;
+      return ncclSuccess;
+    }
+    if (args->posted > args->received) {
+      int buffSlot = args->received%NCCL_STEPS;
+      if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
+        TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, reqFifo[buffSlot].size);
+        if (args->protocol == NCCL_PROTO_LL) { // ll
+          // re-attach flag
+          uint32_t flag = NCCL_LL_FLAG(args->received + 1);
+          int stepLines = stepSize / sizeof(union ncclLLFifoLine);
+          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
+          uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
+          int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t));
+          for (int i=0; i<nFifoLines; i++) {
+            lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
+            lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
           }
-          args->idle = 0;
         }
+        args->received += args->sliceSteps;
+        if (reqFifo[buffSlot].size > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
+          NCCLCHECK(collNetIflush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle, args->requests+buffSlot));
+        } else {
+          args->requests[buffSlot] = NULL;
+        }
+        args->idle = 0;
+        return ncclSuccess;
       }
     }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    if (args->received > args->transmitted) {
+      // Progress flush operations
+      int buffSlot = args->transmitted%NCCL_STEPS;
+      int done = 1;
+      if (args->requests[buffSlot]) NCCLCHECK(collNetTest(args->requests[buffSlot], &done, NULL));
+      if (done) {
+        args->transmitted += args->sliceSteps;
+        __sync_synchronize();
+        resources->recvMem->tail = args->transmitted;
+        args->idle = 0;
+        return ncclSuccess;
+      }
+    }
+    if (args->transmitted > args->done) {
+      volatile uint64_t* sendHead = &resources->sendMem->head;
+      uint64_t done = *sendHead;
+      while (done > args->done &&
+             // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
+             args->transmitted > args->done) {
+        args->done += args->sliceSteps;
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+      }
     }
   }
   return ncclSuccess;
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 49cd8d2bba..86c43f87a0 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -7,6 +7,7 @@
 #include "comm.h"
 #include "net.h"
 #include "graph.h"
+#include "collectives.h"
 
 struct netConnectInfo {
   ncclNetHandle_t netHandle;
@@ -22,6 +23,7 @@ struct netSendResources {
   struct ncclRecvMem* recvMem;
   int netDev;
   int useGdr;
+  int shared;
   char* buffers[LOC_COUNT];
   int buffSizes[LOC_COUNT];
   void* mhandles[LOC_COUNT];
@@ -37,6 +39,7 @@ struct netRecvResources {
   struct ncclRecvMem* recvMem;
   int netDev;
   int useGdr;
+  int shared;
   char* buffers[LOC_COUNT];
   int buffSizes[LOC_COUNT];
   void* mhandles[LOC_COUNT];
@@ -51,108 +54,118 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
   return ncclSuccess;
 }
 
+NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
+
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
-ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
   struct netSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
+  send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
 
-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
 
   NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
   NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
 
   send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
   send->conn.tail = &resources->recvMem->tail;
-  send->conn.fifo = resources->recvMem->sizesFifo;
+  send->conn.sizesFifo = resources->recvMem->sizesFifo;
+  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+  send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
   send->conn.head = &resources->sendMem->head;
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+  resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
 
-  int protoLoc[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+  if (resources->shared == 0) {
+    int protoLoc[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    }
+    int buffSizes[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      buffSizes[p] = send->comm->buffSizes[p];
+      resources->buffSizes[protoLoc[p]] += buffSizes[p];
+    }
+
+    if (resources->buffSizes[LOC_DEVMEM]) {
+      NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
+    }
+    if (resources->buffSizes[LOC_HOSTMEM]) {
+      NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+    }
+
+    int offsets[LOC_COUNT];
+    offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
+      send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+      offsets[protoLoc[p]] += buffSizes[p];
+    }
   }
 
-  int buffSizes[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    // Only allocate buffers for simple for p2p connections
-    buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : send->comm->buffSizes[p];
-    resources->buffSizes[protoLoc[p]] += buffSizes[p];
-  }
-
-  if (resources->buffSizes[LOC_DEVMEM]) {
-    NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
-  }
-  if (resources->buffSizes[LOC_HOSTMEM]) {
-    NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
-  }
-
-  int offsets[LOC_COUNT];
-  offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
-    send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
-    offsets[protoLoc[p]] += buffSizes[p];
-  }
-
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : "");
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
   return ncclSuccess;
 }
 
-ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
+ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
   struct netRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
+  recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
 
-  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
 
   NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
   NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
 
   recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
   recv->conn.tail = &resources->recvMem->tail;
+  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+  recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
   recv->conn.head = &resources->sendMem->head;
 
-  int protoLoc[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+  if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p
+    int protoLoc[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    }
+
+    int buffSizes[NCCL_NUM_PROTOCOLS];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      buffSizes[p] = recv->comm->buffSizes[p];
+      resources->buffSizes[protoLoc[p]] += buffSizes[p];
+    }
+
+    if (resources->buffSizes[LOC_DEVMEM]) {
+      NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
+    }
+    if (resources->buffSizes[LOC_HOSTMEM]) {
+      NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+    }
+
+    int offsets[LOC_COUNT];
+    offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
+      recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+      offsets[protoLoc[p]] += buffSizes[p];
+    }
   }
 
-  int buffSizes[NCCL_NUM_PROTOCOLS];
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    // Only allocate buffers for simple for p2p connections
-    buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : recv->comm->buffSizes[p];
-    resources->buffSizes[protoLoc[p]] += buffSizes[p];
-  }
-
-  if (resources->buffSizes[LOC_DEVMEM]) {
-    NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
-  }
-  if (resources->buffSizes[LOC_HOSTMEM]) {
-    NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
-  }
-
-  int offsets[LOC_COUNT];
-  offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
-    recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
-    offsets[protoLoc[p]] += buffSizes[p];
-  }
-
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : "");
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
   struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
   NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
 
   return ncclSuccess;
 }
 
-ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
   // Setup device pointers
   struct netSendResources* resources = (struct netSendResources*)send->transportResources;
   struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
@@ -160,6 +173,13 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
   // Connect to remote peer
   NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
 
+  if (resources->shared) {
+    // Get shared buffers
+    int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
+    resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+  }
+
   if (resources->buffSizes[LOC_DEVMEM]) {
     NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
   }
@@ -170,7 +190,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
 }
 
 /* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
   // Setup device pointers
   struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
 
@@ -178,6 +198,13 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
   NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
   NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
 
+  if (resources->shared) {
+    // Get shared buffers
+    int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+    NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
+    resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+  }
+
   if (resources->buffSizes[LOC_DEVMEM]) {
     NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
   }
@@ -195,8 +222,10 @@ ncclResult_t netSendFree(void* transportResources) {
     if (resources->buffers[l])
       NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
   }
-  NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
-  CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+  if (resources->shared == 0) {
+    NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+    CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+  }
   NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   free(resources);
   return ncclSuccess;
@@ -210,115 +239,121 @@ ncclResult_t netRecvFree(void* transportResources) {
     if (resources->buffers[l])
       NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
   }
-  NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
-  CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+  if (resources->shared == 0) {
+    NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+    CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+  }
   NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
   free(resources);
   return ncclSuccess;
 }
 
+static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
+
 ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
   struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
   if (args->state == ncclProxyOpReady) {
     // Round to next multiple of sliceSteps
     resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
     args->state = ncclProxyOpProgress;
   }
+  args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
     int p = args->protocol;
     int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
     char* localBuff = args->connector->conn.buffs[p];
     void* mhandle = *(resources->mhandlesProto[p]);
-    args->idle = 1;
-    if (args->head < args->end) {
-      int buffSlot = args->tail%NCCL_STEPS;
-      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
-        volatile int* sizesFifo = resources->recvMem->sizesFifo;
-        volatile uint64_t* recvTail = &resources->recvMem->tail;
+    int buffSize = stepSize*args->sliceSteps;
+    if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
+    if (args->sendbytes < buffSize) buffSize = args->sendbytes;
+    // Post buffers to the GPU
+    if (args->posted < args->end && args->posted < args->done + NCCL_STEPS) {
+      if (resources->shared) {
+        char* ptr;
+        NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, &ptr));
+        if (ptr == NULL) return ncclInternalError;
+        resources->recvMem->ptrsFifo[args->posted%NCCL_STEPS] = ptr;
+        __sync_synchronize();
+        volatile uint64_t* sendHead = &resources->sendMem->head;
+        args->posted += args->sliceSteps;
+        *sendHead = args->posted - NCCL_STEPS;
+      } else args->posted += args->sliceSteps;
+      args->idle = 0;
+      return ncclSuccess;
+    }
+    // Check whether we received data from the GPU and send it to the network
+    int buffSlot = args->transmitted%NCCL_STEPS;
+    if (args->transmitted < args->posted && args->transmitted < args->done + NCCL_STEPS) {
+      volatile int* sizesFifo = resources->recvMem->sizesFifo;
+      volatile uint64_t* recvTail = &resources->recvMem->tail;
+      if (sizesFifo[buffSlot] != -1 && (*recvTail > args->transmitted || args->protocol == NCCL_PROTO_LL)) {
+        // We have something to receive, let's check if it's completely ready.
+        int size = sizesFifo[buffSlot];
+        char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+        int ready = 1;
         if (args->protocol == NCCL_PROTO_LL128) {
-          if (args->tail < *recvTail) {
-            if (sizesFifo[buffSlot] != -1) {
-              int ready = resources->useGdr;
-              if (!ready) {
-                // When data is in sysmem, we need to wait until all flags are correct since the GPU only
-                // called threadfence()
-                uint64_t flag = args->tail + 1;
-                int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
-                volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
-                ready = 1;
-                for (int i=0; i<nFifoLines; i++) {
-                  if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
-                }
-              }
-              if (ready) {
-                // Send through network
-                NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
-                if (args->requests[buffSlot] != NULL) {
-                  sizesFifo[buffSlot] = -1;
-                  // Make sure size is reset to zero before we update the head.
-                  __sync_synchronize();
-                  args->tail += args->sliceSteps;
-                  args->idle = 0;
-                }
-              }
+          int ready = resources->useGdr;
+          if (!ready) {
+            // When data is in sysmem, we need to wait until all flags are correct since the GPU only
+            // called threadfence()
+            uint64_t flag = args->transmitted + 1;
+            int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
+            volatile uint64_t* lines = (volatile uint64_t*)buff;
+            ready = 1;
+            for (int i=0; i<nFifoLines; i++) {
+              if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
             }
           }
         } else if (args->protocol == NCCL_PROTO_LL) {
-          int size = sizesFifo[buffSlot];
-          if (size != -1) {
-            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
-            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
-            size = nFifoLines * sizeof(union ncclLLFifoLine);
-            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-            int ready = 1;
-            for (int i=0; i<nFifoLines; i++) {
-              volatile uint32_t *f1 = &lines[i].flag1;
-              volatile uint32_t *f2 = &lines[i].flag2;
-              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
-            }
-            if (ready) {
-              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, mhandle, args->requests+buffSlot));
-              if (args->requests[buffSlot] != NULL) {
-                sizesFifo[buffSlot] = -1;
-                // Make sure size is reset to zero before we update the head.
-                __sync_synchronize();
-                args->tail += args->sliceSteps;
-                args->idle = 0;
-              }
-            }
-          }
-        } else if (args->tail < *recvTail) {
-          // Send through network
-          if (sizesFifo[buffSlot] != -1) {
-            NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
-            if (args->requests[buffSlot] != NULL) {
-              sizesFifo[buffSlot] = -1;
-              // Make sure size is reset to zero before we update the head.
-              __sync_synchronize();
-              args->tail += args->sliceSteps;
-              args->idle = 0;
-            }
+          uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1);
+          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff;
+          for (int i=0; i<nFifoLines; i++) {
+            volatile uint32_t *f1 = &lines[i].flag1;
+            volatile uint32_t *f2 = &lines[i].flag2;
+            if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
           }
         }
-      }
-      if (args->head < args->tail) {
-        int done;
-        int buffSlot = args->head%NCCL_STEPS;
-        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
-        if (done) {
-          args->head += args->sliceSteps;
-          resources->sendMem->head = args->head;
-          args->idle = 0;
+        if (ready) {
+          // Data is ready, try to send.
+          NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, args->requests+buffSlot));
+          if (args->requests[buffSlot] != NULL) {
+            TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
+            sizesFifo[buffSlot] = -1;
+            // Make sure size is reset to zero before we update the head.
+            __sync_synchronize();
+            args->transmitted += args->sliceSteps;
+            args->idle = 0;
+            return ncclSuccess;
+          }
         }
       }
     }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    // Check whether the network has completed some send operations.
+    if (args->done < args->transmitted) {
+      int done;
+      int buffSlot = args->done%NCCL_STEPS;
+      NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+      if (done) {
+        TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]);
+        if (resources->shared) {
+          char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
+          NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, ptr));
+        }
+        args->done += args->sliceSteps;
+
+        if (resources->shared == 0) {
+          resources->sendMem->head = args->done;
+        }
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+        return ncclSuccess;
+      }
     }
   }
   return ncclSuccess;
@@ -329,46 +364,88 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     // Round to next multiple of sliceSteps
     resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->head = resources->step;
-    args->tail = resources->step;
-    args->end = args->head + args->nsteps;
+    args->posted = args->received = args->transmitted = args->done = resources->step;
+    args->end = resources->step + args->nsteps;
     args->state = ncclProxyOpProgress;
   }
+  args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
-    args->idle = 1;
     int p = args->protocol;
     int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
     char* localBuff = args->connector->conn.buffs[p];
     void* mhandle = *(resources->mhandlesProto[p]);
-    if (args->head < args->end) {
-      volatile uint64_t* sendHead = &resources->sendMem->head;
-      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
-        int buffSlot = args->tail%NCCL_STEPS;
-        int sliceSize = stepSize * args->sliceSteps;
-        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
-        if (args->requests[buffSlot] != NULL) {
-          args->tail += args->sliceSteps;
-          args->idle = 0;
-        }
+    int buffSize = stepSize*args->sliceSteps;
+    if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
+    if (args->recvbytes < buffSize) buffSize = args->recvbytes;
+    if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) {
+      int buffSlot = args->posted%NCCL_STEPS;
+      char* ptr;
+      if (resources->shared) {
+        NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, &ptr));
+        if (ptr == NULL) return ncclInternalError;
+        volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
+        ptrsFifo[buffSlot] = ptr;
+      } else {
+        ptr = localBuff+buffSlot*stepSize;
       }
-      if (args->tail > args->head) {
-        int buffSlot = args->head%NCCL_STEPS;
-        int done, size;
-        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
-        if (done) {
-          args->head += args->sliceSteps;
-          if (args->protocol == NCCL_PROTO_SIMPLE) {
-            if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
-            resources->recvMem->tail = args->head;
-          }
-          args->idle = 0;
-        }
+      NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
+      if (args->requests[buffSlot] != NULL) {
+        TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
+        args->posted += args->sliceSteps;
+        args->idle = 0;
+        return ncclSuccess;
+      } else if (resources->shared) {
+        NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
       }
     }
-    if (args->head == args->end) {
-      resources->step = args->end;
-      args->idle = 0;
-      args->state = ncclProxyOpNone;
+    if (args->posted > args->received) {
+      int buffSlot = args->received%NCCL_STEPS;
+      int done, size;
+      NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
+      if (done) {
+        args->received += args->sliceSteps;
+        if (size > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) {
+          // Don't pass data to the GPU yet, flush first.
+          volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
+          char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
+          NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, args->requests+buffSlot));
+        } else {
+          args->requests[buffSlot] = NULL;
+        }
+        args->idle = 0;
+        return ncclSuccess;
+      }
+    }
+    if (args->received > args->transmitted) {
+      // Progress flush operations
+      int buffSlot = args->transmitted%NCCL_STEPS;
+      int done = 1;
+      if (args->requests[buffSlot]) NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+      if (done) {
+        args->transmitted += args->sliceSteps;
+        __sync_synchronize();
+        resources->recvMem->tail = args->transmitted;
+        args->idle = 0;
+        return ncclSuccess;
+      }
+    }
+    if (args->transmitted > args->done) {
+      volatile uint64_t* sendHead = &resources->sendMem->head;
+      uint64_t done = *sendHead;
+      while (done > args->done &&
+          // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
+          args->transmitted > args->done) {
+        if (resources->shared) {
+          char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS];
+          NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr));
+        }
+        args->done += args->sliceSteps;
+        args->idle = 0;
+        if (args->done == args->end) {
+          resources->step = args->end;
+          args->state = ncclProxyOpNone;
+        }
+      }
     }
   }
   return ncclSuccess;
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 97eca9f8b5..f8242a539c 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -24,9 +24,8 @@
 #include "ibvwrap.h"
 
 #define USE_RDMA_WRITE 1
-#define USE_RDMA_SEND_INLINE 0
 #define MAXNAMESIZE 64
-static char ncclIbIfName[MAX_IF_NAME_SIZE];
+static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
 static union socketAddress ncclIbIfAddr;
 
 static int ncclNIbDevs = -1;
@@ -57,6 +56,8 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
 NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
 NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
 NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
+NCCL_PARAM(IbPkey, "IB_PKEY", 0);
+NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
 NCCL_PARAM(IbSl, "IB_SL", 0);
 NCCL_PARAM(IbTc, "IB_TC", 0);
 NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
@@ -199,7 +200,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
             ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
       }
       line[1023] = '\0';
-      char addrline[1024];
+      char addrline[SOCKET_NAME_MAXLEN+1];
       INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
     }
     pthread_mutex_unlock(&ncclIbLock);
@@ -246,7 +247,7 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
   return ncclSuccess;
 }
 
-#define MAX_REQUESTS 128
+#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
 
 struct ncclIbQpInfo {
   uint32_t lid;
@@ -267,18 +268,19 @@ struct ncclIbHandle {
   union socketAddress connectAddr;
 };
 
-struct ncclIbVerbs {
-  struct ibv_pd* pd;
-  struct ibv_cq* cq;
-};
-
 struct ncclIbRequest {
   int used;
   int type;
   struct ncclIbVerbs* verbs;
-  int done;
+  int events;
   int size;
-  int free;
+};
+
+struct ncclIbVerbs {
+  struct ibv_pd* pd;
+  struct ibv_cq* cq;
+  uint64_t pad[2];
+  struct ncclIbRequest reqs[MAX_REQUESTS];
 };
 
 struct ncclIbListenComm {
@@ -292,18 +294,23 @@ struct ncclIbSendFifo {
   uint32_t seq;
   uint32_t rkey;
   uint32_t ready;
+  uint64_t pad[1]; // Pad FIFO element size to be 32-bytes
 };
 
 struct ncclIbSendComm {
   struct ncclIbVerbs verbs;
   struct ncclIbSendFifo fifo[MAX_REQUESTS];
-  struct ncclIbRequest reqs[MAX_REQUESTS];
   uint32_t fifoHead;
   int fd;
   int ready;
   struct ibv_qp* qp;
   struct ibv_mr* fifoMr;
 };
+// The SendFifo needs to be 32-byte aligned and each element needs
+// to be a 32-byte multiple, so that an entry does not get split and
+// written out of order when IB Relaxed Ordering is enabled
+static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
+static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
 
 struct ncclIbGpuFlush {
   int enabled;
@@ -326,16 +333,17 @@ struct ncclIbRemFifo {
 struct ncclIbRecvComm {
   struct ncclIbVerbs verbs;
   struct ncclIbRemFifo remFifo;
-  struct ncclIbRequest reqs[MAX_REQUESTS];
   int fd;
   int ready;
   struct ibv_qp* qp;
   struct ncclIbGpuFlush gpuFlush;
 };
+static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
 
 ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
   NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
-  NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, MAX_REQUESTS, NULL, NULL, 0));
+  // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
+  NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS, NULL, NULL, 0));
   return ncclSuccess;
 }
 
@@ -351,17 +359,17 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
   qpInitAttr.send_cq = verbs->cq;
   qpInitAttr.recv_cq = verbs->cq;
   qpInitAttr.qp_type = IBV_QPT_RC;
-  // We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM)
+  // We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
   qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
   qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
   qpInitAttr.cap.max_send_sge = 1;
   qpInitAttr.cap.max_recv_sge = 1;
-  qpInitAttr.cap.max_inline_data = 0;
+  qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
   NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr));
   struct ibv_qp_attr qpAttr;
   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
   qpAttr.qp_state = IBV_QPS_INIT;
-  qpAttr.pkey_index = 0;
+  qpAttr.pkey_index = ncclParamIbPkey();
   qpAttr.port_num = ib_port;
   qpAttr.qp_access_flags = access_flags;
   NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
@@ -476,7 +484,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   socklen_t socklen = sizeof(struct sockaddr_in);
   SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
   struct ncclIbQpInfo remQpInfo;
-  NCCLCHECK(socketReceive(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
+  NCCLCHECK(socketRecv(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
 
   // IB setup
   ibv_context* ctx = ncclIbDevs[lComm->dev].context;
@@ -504,14 +512,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
   rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
   rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
-
-#if USE_RDMA_SEND_INLINE
-  // Determine whether the remFifo element data can be sent INLINE
-  struct ibv_qp_attr attr;
-  struct ibv_qp_init_attr init_attr;
-  NCCLCHECK(wrap_ibv_query_qp(qp, &attr, IBV_QP_CAP, &init_attr));
-  if (init_attr.cap.max_inline_data >= rComm->remFifo.sge.length) rComm->remFifo.flags = IBV_SEND_INLINE;
-#endif
+  if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
 
   // Allocate Flush dummy buffer for GPU Direct RDMA
   rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
@@ -548,16 +549,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** req) {
+ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
   for (int i=0; i<MAX_REQUESTS; i++) {
-    struct ncclIbRequest* r = reqs+i;
+    struct ncclIbRequest* r = verbs->reqs+i;
     if (r->used == 0) {
       r->used = 1;
       r->type = 0;
-      r->verbs = NULL;
-      r->done = 0;
+      r->verbs = verbs;
+      r->events = 1;
       r->size = -1;
-      r->free = 0;
       *req = r;
       return ncclSuccess;
     }
@@ -566,6 +566,10 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
   *req = NULL;
   return ncclInternalError;
 }
+ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
+  r->used = 0;
+  return ncclSuccess;
+}
 
 ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
   struct ncclIbQpInfo remQpInfo;
@@ -580,7 +584,6 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
   NCCLCHECK(ncclIbRtrQp(qp, &remQpInfo));
   NCCLCHECK(ncclIbRtsQp(qp));
   comm->ready = 1;
-
   // Block until this is done. It *should* not block indefinitely.
   NCCLCHECK(socketSend(comm->fd, &comm->ready, sizeof(int)));
 
@@ -601,6 +604,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
 #define REG_ALIGN (4096)
 
 ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
   struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
   uint64_t addr = (uint64_t)data;
   assert(size > 0);
@@ -634,8 +638,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
   if (*readyPtr == 0) { *request = NULL; return ncclSuccess; }
 
   struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
+  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
   req->size = size;
 
   struct ibv_send_wr wr;
@@ -651,23 +654,24 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
     wr.sg_list = &sge;
     wr.num_sge = 1;
   }
+#if USE_RDMA_WRITE == 0
   wr.opcode = IBV_WR_SEND;
   wr.send_flags = IBV_SEND_SIGNALED;
-
-  int useAr = 0;
-  if (size > ncclParamIbArThreshold()) {
-    useAr = 1;
-  }
-#if USE_RDMA_WRITE
+#else
   __sync_synchronize(); // order the readyPtr load against rkey load below
   // Sanity checks to catch user collective call count/size mismatches
   // plus any potential programming errors
-  if (size > slot->size || slot->size <= 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
+  if (size > slot->size || slot->size < 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
     WARN("NET/IB : collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
         size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
     return ncclInternalError;
   }
+  int useAr = 0;
+  if (size > ncclParamIbArThreshold()) {
+    useAr = 1;
+  }
   wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.send_flags = useAr ? 0 : IBV_SEND_SIGNALED;
   wr.wr.rdma.remote_addr = slot->addr;
   wr.wr.rdma.rkey = slot->rkey;
   wr.imm_data = size; // Send the message size via imm_data
@@ -691,7 +695,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
     wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
     wr.sg_list = NULL;
     wr.num_sge = 0;
-    wr.send_flags &= ~IBV_SEND_SIGNALED;
+    wr.send_flags |= IBV_SEND_SIGNALED;
     NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
   }
 #endif
@@ -699,28 +703,51 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size) {
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) {
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
-  struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
-  req->free = 1; // Not a user req ; free as soon as it is complete.
-  wr.wr_id = (uint64_t)req;
 
-  struct ncclIbSendFifo* localElem = comm->remFifo.elems + (comm->remFifo.tail % MAX_REQUESTS);
+  int slot = comm->remFifo.tail%MAX_REQUESTS;
+  struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot;
   localElem->addr = addr;
   localElem->rkey = rkey;
   localElem->ready = 1;
   localElem->size = size; // Sanity/Debugging
   localElem->seq = comm->remFifo.tail; // Sanity/Debugging
-  wr.wr.rdma.remote_addr = comm->remFifo.addr + (comm->remFifo.tail % MAX_REQUESTS) * sizeof(struct ncclIbSendFifo);
+  wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo);
   wr.wr.rdma.rkey = comm->remFifo.rkey;
   comm->remFifo.sge.addr = (uint64_t)localElem;
   wr.sg_list = &comm->remFifo.sge;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_WRITE;
-  wr.send_flags = IBV_SEND_SIGNALED | comm->remFifo.flags; // IBV_SEND_INLINE
+  wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE
+
+  // We need to occasionally post a request with the IBV_SEND_SIGNALED flag, otherwise
+  // the send queue will never empty.
+  //
+  // From https://www.rdmamojo.com/2014/06/30/working-unsignaled-completions/
+  // "How to use Unsignaled Completion?" / "Gotchas and Pitfalls"
+  // All posted Send Requested, Signaled and Unsignaled, are considered outstanding until
+  // a Work Completion that they, or Send Requests that were posted after them, was polled
+  // from the Completion Queue associated with the Send Queue. This means if one works with
+  // a Queue Pair that was configured to work with Unsignaled Completions, he must make
+  // sure that occasionally (before the Send Queue is full with outstanding Send Requests)
+  // a Send Request that generate Work Completion will be posted.
+  //
+  // Not following this rule may lead to a case that the Send Queue is full with Send
+  // Requests that won't generate Work Completion:
+  //
+  //  - The Send Queue is full, so no new Send Requests can be posted to it
+  //  - The Send Queue can't be emptied, since no Work Completion can be generated anymore
+  //    (the reason is that no Work Completion, that can generate Work Completion that
+  //    polling it will empty the Send Queue, can be posted)
+  //  - The status of all posted Send Request is considered unknown
+  //
+  if (slot == 0) {
+    wr.send_flags |= IBV_SEND_SIGNALED;
+    wr.wr_id = (uint64_t)req;
+    req->events++;
+  }
 
   struct ibv_send_wr* bad_wr;
   NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
@@ -737,8 +764,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
   struct ibv_mr* mr = (struct ibv_mr*)mhandle;
 
   struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
+  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
   req->size = size;
 
   struct ibv_recv_wr wr;
@@ -760,17 +786,16 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, vo
   *request = req;
 
   // Post to FIFO to notify sender
-  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
+  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req));
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
+ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
 
   struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->verbs = &comm->verbs;
+  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
   struct ibv_mr* mr = (struct ibv_mr*)mhandle;
 
   struct ibv_send_wr wr;
@@ -787,11 +812,7 @@ ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
   struct ibv_send_wr* bad_wr;
   NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
 
-  int done = 0;
-  while (done == 0) {
-    NCCLCHECK((ncclResult_t)ncclIbTest(req, &done, NULL));
-  }
-
+  *request = req;
   return ncclSuccess;
 }
 
@@ -800,10 +821,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
   *done = 0;
 
   while (1) {
-    if (r->done == 1) {
+    if (r->events == 0) {
       *done = 1;
       if (size) *size = r->size;
-      r->used = 0;
+      NCCLCHECK(ncclIbFreeRequest(r));
       return ncclSuccess;
     }
 
@@ -828,11 +849,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
           doneReq->size = wc->imm_data;
 #endif
         }
-        doneReq->done = 1;
-        if (doneReq->free == 1) {
-          // This is an internal (FIFO post) req. Free it immediately.
-          doneReq->used = 0;
-        }
+        doneReq->events--;
       }
     }
   }
@@ -887,7 +904,7 @@ ncclNet_t ncclNetIb = {
   ncclIbDeregMr,
   ncclIbIsend,
   ncclIbIrecv,
-  ncclIbFlush,
+  ncclIbIflush,
   ncclIbTest,
   ncclIbCloseSend,
   ncclIbCloseRecv,
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 5bc22c3ca3..272d8cdc5c 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -48,17 +48,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
         WARN("NET/Socket : no interface found");
         return ncclInternalError;
       } else {
-        char line[1024];
-        char addrline[1024];
+        #define MAX_LINE_LEN (2047)
+        char line[MAX_LINE_LEN+1];
+        char addrline[SOCKET_NAME_MAXLEN+1];
         line[0] = '\0';
+        addrline[SOCKET_NAME_MAXLEN] = '\0';
         for (int i=0; i<ncclNetIfs; i++) {
           strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
           memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
           NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
-          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
+          snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
               socketToString(&addrs[i].sa, addrline));
         }
-        line[1023] = '\0';
+        line[MAX_LINE_LEN] = '\0';
         INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
       }
     }
@@ -112,8 +114,7 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
 
 #define MAX_SOCKETS 64
 #define MAX_THREADS 16
-#define MAX_REQUESTS 128
-#define MAX_QUEUE_LEN MAX_REQUESTS
+#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
 #define MIN_CHUNKSIZE (64*1024)
 
 NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
@@ -149,6 +150,7 @@ struct ncclSocketRequest {
 
 struct ncclSocketTaskQueue {
   int next;
+  int len;
   struct ncclSocketTask* tasks;
 };
 
@@ -188,7 +190,7 @@ void* persistentSocketThread(void *args_) {
   while (1) {
     int idle = 1;
     int mark = myQueue->next; // mark newest task seen
-    for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) {
+    for (int i=0; i<myQueue->len; i+=nSocksPerThread) {
       int repeat;
       do {
         repeat = 0;
@@ -363,7 +365,11 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
   struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
   // create helper threads and prepare per-thread task queue
   if (queue->tasks == NULL) {
-    NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN));
+    // each request can be divided up to nSocks tasks, and
+    // these tasks are distributed to nThreads threads,
+    // we need to make sure each thread queue has enough slots for MAX_REQUESTS
+    queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads);
+    NCCLCHECK(ncclCalloc(&queue->tasks, queue->len));
     queue->next = 0;
     res->comm = comm;
     pthread_mutex_init(&res->threadLock, NULL);
@@ -382,7 +388,7 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
     r->used = 1;
     *req = r;
     pthread_mutex_lock(&res->threadLock);
-    queue->next = (queue->next+1)%MAX_QUEUE_LEN;
+    queue->next = (queue->next+1)%queue->len;
     res->state = start;
     pthread_cond_signal(&res->threadCond);
     pthread_mutex_unlock(&res->threadLock);
@@ -420,6 +426,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
     // divide into subtasks
     int chunkOffset = 0, i = 0;
     if (r->comm->nSocks > 0) {
+      // each request can be divided up to nSocks tasks
       int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
       while (chunkOffset < r->size) {
         int chunkSize = std::min(taskSize, r->size-chunkOffset);
@@ -477,7 +484,7 @@ ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
+ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
   // We don't support CUDA pointers, so we don't need a flush operation
   return ncclInternalError;
 }
@@ -526,7 +533,7 @@ ncclNet_t ncclNetSocket = {
   ncclSocketDeregMr,
   ncclSocketIsend,
   ncclSocketIrecv,
-  ncclSocketFlush,
+  ncclSocketIflush,
   ncclSocketTest,
   ncclSocketClose,
   ncclSocketClose,
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 2cbe390b29..e05a43344a 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -7,24 +7,29 @@
 #include "comm.h"
 #include "graph.h"
 #include "utils.h"
+#include "bootstrap.h"
 
 struct p2pConnectInfo {
-  int direct;
+  int rank;
   int read;
-  union {
-    void* directPtr;
-    cudaIpcMemHandle_t devIpc;
-  };
+  void* directPtr;
+  cudaIpcMemHandle_t devIpc;
 };
 
 struct p2pSendResources {
   struct ncclSendMem* devMem;
   void* ipcPtr;
+  int remoteId;
+  int memRank;
+  void* bootstrap;
 };
 
 struct p2pRecvResources {
   struct ncclRecvMem* devMem;
   void* ipcPtr;
+  int remoteId;
+  int memRank;
+  void* bootstrap;
 };
 
 #include <sys/types.h>
@@ -55,9 +60,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
   }
 
   // Check topology / p2p level.
-  int read;
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, &read));
+  int intermediateRank;
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
   if (*ret == 0) return ncclSuccess;
+  if (intermediateRank != -1) return ncclSuccess;
 
   // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
   int cudaDev1 = busIdToCudaDev(info1->busId);
@@ -100,145 +106,134 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 // Setting this to non zero causes P2P to use Reads rather than Writes
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
 
-static int p2pUseRead(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
-  int readEnable = ncclParamP2pReadEnable();
-  if (readEnable != -2) return readEnable;
-
-  int p2p, read;
+static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
+  int p2p;
   // Queries the topology to see if the GPUs are Ampere and
   // connected via NVLink, if so we enable P2P Read by default
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, &read));
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));
 
-  return read;
+  int readEnable = ncclParamP2pReadEnable();
+  if (readEnable != -2) *read = readEnable;
+  return ncclSuccess;
 }
 
-/* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
-    struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
-
-  struct p2pSendResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  send->transportResources = resources;
-  int useRead = p2pUseRead(topo, myInfo, peerInfo);
-  int sendSize = sizeof(struct ncclSendMem);
-  // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
-  if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
-  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
-
-  struct p2pConnectInfo info;
-  info.read = useRead;
-  const char* useReadStr = info.read ? "/read" : "";
+static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) {
   if (myInfo->pidHash == peerInfo->pidHash) {
-    info.direct = 1;
-    info.directPtr = resources->devMem;
-    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%d] -> %d[%d] via P2P/common device%s",
-          channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr);
-      return ncclInternalError;
-    } else {
+    if (peerInfo->cudaDev != myInfo->cudaDev) {
       // Enable P2P access
       cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
         cudaGetLastError();
       } else if (err != cudaSuccess) {
         WARN("failed to peer with device %d(=%lx): %d %s",
-             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+            peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
-      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
+    }
+    *devMem = p2pInfo->directPtr;
+    *ipcPtr = NULL;
+  } else {
+    CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pInfo->devIpc, cudaIpcMemLazyEnablePeerAccess));
+    *ipcPtr = *devMem;
+  }
+  return ncclSuccess;
+}
+
+/* Send: Create and return connect structures for this peer to connect to me */
+ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+
+  struct p2pSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  send->transportResources = resources;
+  int useRead, intermediateRank;
+  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
+  int sendSize = sizeof(struct ncclSendMem);
+  // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
+  if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
+  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
+
+  struct p2pConnectInfo info;
+  info.read = useRead;
+  const char* useReadStr = info.read ? "/read" : "";
+
+  resources->remoteId = -1;
+  resources->bootstrap = comm->bootstrap;
+  if (intermediateRank == -1) {
+    NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize));
+    info.rank = myInfo->rank;
+    if (myInfo->pidHash == peerInfo->pidHash) {
+      if (useRead == 0) send->conn.direct |= NCCL_DIRECT_GPU;
+      INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
+          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
+    } else {
+      CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr));
+      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
           channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
     }
   } else {
-    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-    info.direct = 0;
-    // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
-      return ncclInternalError;
-    }
-    INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
-        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
-    //TRACE_DUMP_IPC(&info.devIpc);
+    NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
+    info.rank = intermediateRank;
+    INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
+        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
+	comm->peerInfo[intermediateRank].busId, useReadStr);
   }
+  resources->memRank = info.rank;
+
+  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
   memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
   return ncclSuccess;
 }
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
     struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
 
   struct p2pRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
-  int useRead = p2pUseRead(topo, myInfo, peerInfo);
+  int useRead, intermediateRank;
+  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
   int recvSize = offsetof(struct ncclRecvMem, buff);
   // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
   ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
 
   struct p2pConnectInfo info;
   info.read = useRead;
-  if (myInfo->pidHash == peerInfo->pidHash) {
-    info.direct = 1;
-    info.directPtr = resources->devMem;
-    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+
+  resources->remoteId = -1;
+  resources->bootstrap = comm->bootstrap;
+  if (intermediateRank == -1) {
+    NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize));
+    info.rank = myInfo->rank;
+    if (myInfo->pidHash == peerInfo->pidHash) {
+      if (useRead == 0) recv->conn.direct |= NCCL_DIRECT_GPU;
     } else {
-      // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d(=%lx): %d %s",
-             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
-        return ncclInternalError;
-      }
-      TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+      CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr));
     }
   } else {
-    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-    info.direct = 0;
-    // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
-      return ncclInternalError;
-    }
-    TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
-    //TRACE_DUMP_IPC(&info.devIpc);
+    NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
+    info.rank = intermediateRank;
   }
+  resources->memRank = info.rank;
+
+  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
   memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
   return ncclSuccess;
 }
 
 /* Connect/Send to this peer */
-static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
   struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
   struct ncclRecvMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
-  if (info->direct) {
-    remDevMem = (struct ncclRecvMem*)(info->directPtr);
-    if (info->read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
-  } else {
-    //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
-    if (err != cudaSuccess) {
-      WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
-      return ncclUnhandledCudaError;
-    }
-  }
+
+  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));
 
   int offset = 0;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -257,26 +252,12 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
 }
 
 /* Connect/Recv from this peer */
-ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
   struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
   struct ncclSendMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
-  if (info->direct) {
-    remDevMem = (struct ncclSendMem*)(info->directPtr);
-    if (info->read == 0) {
-      recv->conn.direct |= NCCL_DIRECT_GPU;
-      recv->conn.ptrExchange = &remDevMem->ptrExchange;
-    }
-  } else {
-    //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    remDevMem = (struct ncclSendMem*)resources->ipcPtr;
-    if (err != cudaSuccess) {
-      WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
-      return ncclUnhandledCudaError;
-    }
-  }
+
+  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->ipcPtr));
 
   int offset = 0;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -290,6 +271,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
   }
   recv->conn.tail = &resources->devMem->tail;
   recv->conn.head = &remDevMem->head;
+  recv->conn.ptrExchange = &remDevMem->ptrExchange;
   return ncclSuccess;
 }
 
@@ -297,6 +279,10 @@ ncclResult_t p2pSendFree(void* resources) {
   struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
   if (sendRes->ipcPtr)
     CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
+  if (sendRes->remoteId != -1) {
+    NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap));
+    sendRes->devMem = NULL;
+  }
   CUDACHECK(cudaFree(sendRes->devMem));
   free(sendRes);
   return ncclSuccess;
@@ -306,6 +292,10 @@ ncclResult_t p2pRecvFree(void* resources) {
   struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
   if (recvRes->ipcPtr)
     CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
+  if (recvRes->remoteId != -1) {
+    NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap));
+    recvRes->devMem = NULL;
+  }
   CUDACHECK(cudaFree(recvRes->devMem));
   free(recvRes);
   return ncclSuccess;
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 488f456273..d8a5b52d2c 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -57,7 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 #define MAX_SHM_NAME_LEN 1024
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
 
   struct shmSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
@@ -81,7 +81,7 @@ ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
   return ncclSuccess;
 }
 
-ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
+ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
   struct shmRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
@@ -106,7 +106,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
 }
 
 /* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
   // Setup device pointers
   struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
   struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -131,7 +131,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
   return ncclSuccess;
 }
 
-ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
   // Setup device pointers
   struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
   struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;

From 399656269027c1818fc999ccf8ec4dd838cec50d Mon Sep 17 00:00:00 2001
From: Jonas Zhou <JonasZhou@zhaoxin.com>
Date: Fri, 27 Nov 2020 16:20:55 +0800
Subject: [PATCH 03/25] x86: Add CPU detection for Zhaoxin processors

Signed-off-by: Jonas Zhou <JonasZhou@zhaoxin.com>
---
 src/graph/paths.cc  |  3 +++
 src/graph/topo.cc   | 10 +++++++++-
 src/graph/topo.h    |  2 ++
 src/include/graph.h |  2 ++
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index c7525e60db..eb556c408b 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -283,6 +283,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
     if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
     else p2pLevel = PATH_PHB;
   }
+  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+    p2pLevel = PATH_PXB;
+  }
 
 compare:
   // Compute the PCI distance and compare with the p2pLevel.
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 3e395c52d1..d53b532ed5 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -72,6 +72,9 @@ static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* wi
   if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
     *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
   }
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+    *width = cpu->cpu.model ==  NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
+  }
   return ncclSuccess;
 }
 
@@ -364,7 +367,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
 }
 
 struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
-struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
+struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { "  Shanghai  ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
 
 ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
   int numaId;
@@ -387,6 +390,11 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
       NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
       NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
       cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+    } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+      int familyId, modelId;
+      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
+      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
+      if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
     }
   }
   for (int s=0; s<xmlCpu->nSubs; s++) {
diff --git a/src/graph/topo.h b/src/graph/topo.h
index a12bb2d1d6..0cb6d335d7 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -19,6 +19,8 @@
 #define PCI_WIDTH 12.0           // PCI Gen3 x16
 #define QPI_WIDTH 6.0
 #define SKL_QPI_WIDTH 9.0
+#define ZPI_WIDTH 6.0
+#define YONGFENG_ZPI_WIDTH 9.0
 #define P9_WIDTH 32.0
 #define ARM_WIDTH 6.0
 #define NET_WIDTH 12.0           // 100Gbit
diff --git a/src/include/graph.h b/src/include/graph.h
index a4dba5c14e..0c912ebebe 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -40,8 +40,10 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
 #define NCCL_TOPO_CPU_ARCH_ARM 3
 #define NCCL_TOPO_CPU_VENDOR_INTEL 1
 #define NCCL_TOPO_CPU_VENDOR_AMD 2
+#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
 #define NCCL_TOPO_CPU_TYPE_BDW 1
 #define NCCL_TOPO_CPU_TYPE_SKL 2
+#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
 ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
 

From e055229e569a459a3888145b9cb9160f230c1efc Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 8 Dec 2020 16:51:34 -0500
Subject: [PATCH 04/25] Revert "Changes to topology based on XGMI (#272)"

This reverts commit 01bd2573db12e5360ea3022eadbc8e8c0b555048.
---
 src/graph/paths.cc  | 91 +--------------------------------------------
 src/graph/search.cc | 64 ++++++++++++++++---------------
 src/graph/topo.h    |  2 +-
 3 files changed, 35 insertions(+), 122 deletions(-)

diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index f8d8e89c91..8f96627f67 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -30,84 +30,6 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
   return ncclInternalError;
 }
 
-// [RCCL]
-// This function traverses only XGMI links (including multi-GPU hops) and builds them into the
-// topology system, which corresponds to how XGMI hardware operates
-static ncclResult_t ncclTopoSetXgmi(struct ncclTopoSystem* system)
-{
-  // Compute paths to GPU g
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    struct ncclTopoNode *baseNode = system->nodes[GPU].nodes+g;
-
-    if (baseNode->paths[baseNode->type] == NULL) {
-      NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
-    }
-
-    // breadth-first search to set all paths to that node in the system
-    struct ncclTopoNodeList nodeList;
-    struct ncclTopoNodeList nextNodeList;
-    nodeList.count = 1; nodeList.list[0] = baseNode;
-    nextNodeList.count = 0;
-    struct ncclTopoLinkList* basePath;
-    NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
-    basePath->count = 0;
-    basePath->width = LOC_WIDTH;
-    basePath->type = PATH_LOC;
-
-    while (nodeList.count) {
-      nextNodeList.count = 0;
-      for (int n=0; n<nodeList.count; n++) {
-        struct ncclTopoNode* node = nodeList.list[n];
-        struct ncclTopoLinkList* path;
-        NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
-        for (int l=0; l<node->nlinks; l++) {
-          struct ncclTopoLink* link = node->links+l;
-          struct ncclTopoNode* remNode = link->remNode;
-
-          // Skip non-XGMI links
-          if (link->type != LINK_NVL) continue;
-
-          if (remNode->paths[baseNode->type] == NULL) {
-            NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
-          }
-
-          struct ncclTopoLinkList* remPath;
-          NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
-          float width = std::min(path->width, link->width);
-          if (remPath->width < width) {
-            // Find reverse link
-            for (int l=0; l<remNode->nlinks; l++) {
-              if (remNode->links[l].remNode == node) {
-                remPath->list[0] = remNode->links+l;
-                break;
-              }
-            }
-            if (remPath->list[0] == NULL) {
-              WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
-                   remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
-              return ncclInternalError;
-            }
-            // Copy the rest of the path
-            for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
-            remPath->count = path->count + 1;
-            remPath->width = width;
-            remPath->type = PATH_NVL;
-
-            // Add to the list for the next iteration if not already in the list
-            // In this case, permit GPUs are intermediate XGMI steps
-            for (int i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) continue;
-            nextNodeList.list[nextNodeList.count++] = remNode;
-          }
-        }
-      }
-      memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
-    }
-  }
-  return ncclSuccess;
-}
-// [/RCCL]
-
-
 static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
   if (baseNode->paths[baseNode->type] == NULL) {
     NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
@@ -139,14 +61,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
         struct ncclTopoLinkList* remPath;
         NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
         float width = std::min(path->width, link->width);
-
-        // [RCCL] Do not let XGMI paths be overwritten (even if PCIe path may be faster)
-        //        Unless they are of shorter length
-     // if (remPath->width < width) {
-        bool notXGMI = remPath->type != PATH_NVL;
-        if (remPath->width < width && notXGMI) {
-        // [/RCCL]
-
+        if (remPath->width < width) {
           // Find reverse link
           for (int l=0; l<remNode->nlinks; l++) {
             if (remNode->links[l].remNode == node) {
@@ -451,10 +366,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
     NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
   }
 
-  // [RCCL] Add XGMI-only links between GPUs first before any other paths
-  NCCLCHECK(ncclTopoSetXgmi(system));
-  // [/RCCL]
-
   // Set direct paths from/to GPUs.
   for (int g=0; g<system->nodes[GPU].count; g++) {
     // Compute paths to GPU g
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 6a0c824b64..56bbd4ca9b 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -159,9 +159,6 @@ struct ncclGpuScore {
   int intraNhops;
   int intraWidth;
   int interNhops;
-  // [RCCL]
-  int intraType;     // New sort parameter to favor XGMI
-  // [/RCCL]
   int interPciWidth;
   int interWidth;    // Most important
 };
@@ -172,9 +169,6 @@ static int cmpScore(const void * g1, const void * g2) {
    int d;
    if ((d = (s2->interWidth - s1->interWidth))) return d;
    if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
-   // [RCCL]
-   if ((d = (s1->intraType - s2->intraType))) return d;   // Prefer XGMI over any other types
-   // [/RCCL]
    if ((d = (s1->interNhops - s2->interNhops))) return d;
    if ((d = (s2->intraWidth - s1->intraWidth))) return d;
    if ((d = (s1->intraNhops - s2->intraNhops))) return d;
@@ -234,25 +228,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
   for (int i=1; i<ngpus; i++) {
     int g = (start+i)%ngpus;
     if (paths[g].count == 0) continue; // There is no path to that GPU
-    // [RCCL] - Prune earlier for performance
-    {
-      if (paths[g].type > graph->typeIntra) continue;         // Skip if the intra path type is already slower than the current target
-      if (paths[g].width < graph->speedIntra) continue;
-      if (netPaths)
-      {
-        if (netPaths[g].type > graph->typeInter) continue;    // Skip if the inter path type is already slower than the current target
-        if (netPaths[g].width < graph->speedInter) continue;
-      }
-    }
-    // [/RCCL]
     if (system->nodes[GPU].nodes[g].used & flag) continue;
     scores[count].g = g;
     scores[count].startIndex = i;
     scores[count].intraNhops = paths[g].count;
     scores[count].intraWidth = paths[g].width;
-    // [RCCL] - Add path type as sort factor
-    scores[count].intraType = paths[g].type;
-    // [/RCCL]
     if (netPaths) {
       scores[count].interNhops = netPaths[g].count;
       scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
@@ -313,6 +293,35 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
   return ncclSuccess;
 }
 
+static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  int ngpus = system->nodes[GPU].count;
+  int count = 0;
+  for (int c=0; c<graph->nChannels; c++) {
+    for (int i=0; i<ngpus; i++) {
+      int g = graph->intra[ngpus*c+i];
+      int n = graph->intra[ngpus*c+((i+1)%ngpus)];
+      struct ncclTopoNode *node;
+      int j;
+      for (j=0; j<ngpus; j++)
+        if (system->nodes[GPU].nodes[j].gpu.rank == g) break;
+      if (j<ngpus) {
+        node = system->nodes[GPU].nodes+j;
+        for (int k = 0; k<system->nodes[GPU].count; k++) {
+          if (node->paths[GPU][k].count == 1) {
+            struct ncclTopoLink* link = node->paths[GPU][k].list[0];
+            struct ncclTopoNode* remNode = link->remNode;
+            if (remNode->gpu.rank == n) {
+              if (link->type == LINK_NVL)
+                count ++;
+            }
+          }
+        }
+      }
+    }
+  }
+  return count;
+}
+
 ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
   // 1. Constraint to get the same nChannels between Rings and Trees
   if (graph->nChannels < graph->minChannels) return ncclSuccess;
@@ -326,6 +335,9 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
   // 3. Less hops (but not at the price of going cross NICs)
   if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
 
+  // 4. Prefer graph with more XGMI connections
+  if (graph->nChannels == refGraph->nChannels
+    && ncclTopoCountXGMI(system, refGraph) < ncclTopoCountXGMI(system, graph)) *copy = 1;
   return ncclSuccess;
 }
 
@@ -451,9 +463,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
           if (paths[i].count < paths[f].count) f = i;
         int t = 1 << 10;
         NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, f));
-        // [RCCL] Event if forced order PCI is found, continue the search instead of ending early
-        // if (t == -1) *time = -1;
-        // [/RCCL]
+        if (t == -1) *time = -1;
       }
 
       // Then try the most local GPUs
@@ -536,14 +546,6 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
     ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
   } else {
     // Intra-node only.
-    // [RCCL] - Instead of trying PCI ordering, or replaying, just go straight to searching
-    {
-      for (int g=0; g<system->nodes[GPU].count; g++) {
-        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
-      }
-      return ncclSuccess;
-    }
-    // [/RCCL]
     if (graph->nChannels == 0) {
       // Try PCI order first
       NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 11f44f6325..9379844bf2 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -23,7 +23,7 @@
 #define P9_WIDTH 32.0
 #define ARM_WIDTH 6.0
 #define NET_WIDTH 12.0           // 100Gbit
-#define VEGA_XGMI_WIDTH 20.0
+#define VEGA_XGMI_WIDTH 24.0
 
 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
 // to GPU traffic consumes more PCI bandwidth.

From 105db19a11dfe1617398fccfd7b75ac722e41769 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 8 Dec 2020 17:35:07 -0500
Subject: [PATCH 05/25] Do not allow GPU as intermediate

---
 src/graph/paths.cc | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 8f96627f67..381d263b44 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -87,14 +87,17 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
           // Consider a path going through the CPU as PATH_PHB
           if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
           // Set 1 hop NVLink as NVB
-          if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
+          //if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
 
           remPath->type = std::max(path->type, type);
 
           // Add to the list for the next iteration if not already in the list
-          int i;
-          for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
-          if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
+          // Disallow GPUs as intermediate steps for now
+          if (remNode->type != GPU) {
+            int i;
+            for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
+            if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
+          }
         }
       }
     }

From f4d5d3d62017cabbcbd4327cf5e4d0482707db0b Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 4 Dec 2020 18:52:32 -0500
Subject: [PATCH 06/25] Port alltoall[v]

---
 CMakeLists.txt                        |   2 +
 src/channel.cc                        |   2 +
 src/collectives/all_to_all_api.cc     |   9 +-
 src/collectives/all_to_allv_api.cc    |   9 +-
 src/collectives/device/all_to_all.cu  |   3 +-
 src/collectives/device/all_to_all.h   | 135 +++++++++++------------
 src/collectives/device/all_to_allv.cu |   3 +-
 src/collectives/device/all_to_allv.h  | 150 +++++++++++++-------------
 src/collectives/device/common.h       |  10 +-
 src/collectives/device/gather.cu      |  12 ---
 src/collectives/device/gather.h       |  74 -------------
 src/collectives/device/scatter.cu     |  12 ---
 src/collectives/device/scatter.h      |  74 -------------
 src/collectives/gather_api.cc         |   9 --
 src/collectives/scatter_api.cc        |   9 --
 src/enqueue.cc                        |  68 ++++++++++--
 src/graph/paths.cc                    |  15 ++-
 src/group.cc                          |   4 +-
 src/include/collectives.h             |  16 ++-
 src/include/devcomm.h                 |  10 +-
 src/include/info.h                    |   8 +-
 src/init.cc                           |  35 ++++--
 src/misc/argcheck.cc                  |   9 +-
 src/proxy.cc                          |  10 +-
 tools/topo_expl/utils.cpp             |  27 ++++-
 25 files changed, 332 insertions(+), 383 deletions(-)
 mode change 100755 => 100644 src/collectives/device/all_to_allv.cu
 mode change 100755 => 100644 src/collectives/device/all_to_allv.h
 delete mode 100644 src/collectives/device/gather.cu
 delete mode 100644 src/collectives/device/gather.h
 delete mode 100644 src/collectives/device/scatter.cu
 delete mode 100644 src/collectives/device/scatter.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 328185535c..c37283c129 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,8 @@ set(CU_SOURCES
     src/collectives/device/broadcast.cu
     src/collectives/device/reduce_scatter.cu
     src/collectives/device/sendrecv.cu
+    src/collectives/device/all_to_all.cu
+    src/collectives/device/all_to_allv.cu
     src/collectives/device/functions.cu)
 
 set(CPP_SOURCES)
diff --git a/src/channel.cc b/src/channel.cc
index 2a4be474bb..f4c969898c 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -27,12 +27,14 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
 
   // Per-channel operation list.
   NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
+  NCCLCHECK(ncclCudaHostCalloc(&channel->a2avParams, comm->nRanks*NCCL_MAX_OPS*4));
   return ncclSuccess;
 }
 
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
   if (channel->id == -1) return ncclSuccess;
   // Operation list
+  NCCLCHECK(ncclCudaHostFree(channel->a2avParams));
   NCCLCHECK(ncclCudaHostFree(channel->workFifo));
 
   // Free Ring index to rank tables
diff --git a/src/collectives/all_to_all_api.cc b/src/collectives/all_to_all_api.cc
index 5320ff5efd..9ecc1c47e9 100644
--- a/src/collectives/all_to_all_api.cc
+++ b/src/collectives/all_to_all_api.cc
@@ -25,10 +25,9 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
     NCCLCHECK(ncclGroupEnd());
     return ncclSuccess;
   } else {
-    //struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
-    //  sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
-    //  ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
-    //return ncclEnqueueCheck(&info);
-    return ncclInternalError;
+    struct ncclInfo info = { ncclFuncAllToAll, "AllToAll",
+      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
+      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
+    return ncclEnqueueCheck(&info);
   }
 }
diff --git a/src/collectives/all_to_allv_api.cc b/src/collectives/all_to_allv_api.cc
index cd7151f9e8..ca405fbf5b 100755
--- a/src/collectives/all_to_allv_api.cc
+++ b/src/collectives/all_to_allv_api.cc
@@ -37,10 +37,9 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
     NCCLCHECK(ncclGroupEnd());
     return ncclSuccess;
   } else {
-    //struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
-    //  sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
-    //  ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
-    //return ncclEnqueueCheck(&info);
-    return ncclInternalError;
+    struct ncclInfo info = { ncclFuncAllToAllv, "AllToAllv",
+      sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
+      ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
+    return ncclEnqueueCheck(&info);
   }
 }
diff --git a/src/collectives/device/all_to_all.cu b/src/collectives/device/all_to_all.cu
index 5a1314276a..46d0486d67 100644
--- a/src/collectives/device/all_to_all.cu
+++ b/src/collectives/device/all_to_all.cu
@@ -8,5 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_FUNC(ncclAllToAll, copy, FuncSum, i8, int8_t);
-IMPL_COLL_KERN(ncclAllToAll, copy, FuncSum, i8, int8_t, 0);
+IMPL_COLL_FUNC(AllToAll, RING, SIMPLE, Sum, int8_t);
diff --git a/src/collectives/device/all_to_all.h b/src/collectives/device/all_to_all.h
index 9b986cdc13..05927a2e8a 100644
--- a/src/collectives/device/all_to_all.h
+++ b/src/collectives/device/all_to_all.h
@@ -9,80 +9,81 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllToAllKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->coll.count;
-  const int nranks = comm->nRanks;
-  const int bid = args->coll.bid;
-  const int rank = ring->devUserRanks[0];
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
-  const int peersPerChan = DIVUP(nranks, nChannels);
-  const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllToAll, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+  __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads;
+    const int nChannels = args->coll.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclRing* ring = &channel->ring;
+    const ssize_t size = args->coll.count;
+    const int nranks = comm->nRanks;
+    const int bid = args->coll.bid;
+    const int rank = ring->devUserRanks[0];
+    const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+    const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
+    const int peersPerChan = DIVUP(nranks, nChannels);
+    const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      for (int i = 0; i < peersPerChan; i++) {
+        if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
+          (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
+          continue;
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
+        int nelem = min(realChunkSize, size-chunkOffset);
+        if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
+          if (tid < nthreads && thisInput != thisOutput) {
+            const T* sendbuff = thisInput+chunkOffset+rank*size;
+            T* recvbuff = thisOutput+chunkOffset+rank*size;
+            // local copy
+            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
+          }
+        }
+      }
+    }
 
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     for (int i = 0; i < peersPerChan; i++) {
       if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
         (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
         continue;
-      int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-      int nelem = min(realChunkSize, size-chunkOffset);
-      if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
-        if (tid < nthreads && thisInput != thisOutput) {
-          const T* sendbuff = thisInput+chunkOffset+rank*size;
-          T* recvbuff = thisOutput+chunkOffset+rank*size;
-          // local copy
-          ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
+      if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
+        int nthreadsSplit = nthreads/2;
+        if (tid < nthreadsSplit ) {
+          int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
+          ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 0, 1, 0, FUNC>
+            prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
+            ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
+            int nelem = min(realChunkSize, size-chunkOffset);
+            ssize_t send_offset = chunkOffset + peerSend*size;
+            prims.send(thisInput+send_offset, nelem);
+          }
+        } else {
+          int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
+          ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 0, 0, FUNC>
+            prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
+            ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
+            int nelem = min(realChunkSize, size-chunkOffset);
+            ssize_t recv_offset = chunkOffset + peerRecv*size;
+            prims.recv(thisOutput+recv_offset, nelem);
+          }
         }
       }
     }
   }
-
-  for (int i = 0; i < peersPerChan; i++) {
-    if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-      (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-      continue;
-    if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
-      int nthreadsSplit = nthreads/2;
-      int peerNone[2] = {-1,-1};
-      if (tid < nthreadsSplit ) {
-        int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-        ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 2, 1, 0, FUNC>
-          prims(tid, nthreadsSplit, peerNone, &peerSend, NULL, stepSize, channel, comm);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-          ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-          ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-          int nelem = min(realChunkSize, size-chunkOffset);
-          ssize_t send_offset = chunkOffset + peerSend*size;
-          prims.send(thisInput+send_offset, nelem);
-        }
-      } else {
-        int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-        ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 2, 0, FUNC>
-          prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, peerNone, NULL, stepSize, channel, comm);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-          ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-          ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-          int nelem = min(realChunkSize, size-chunkOffset);
-          ssize_t recv_offset = chunkOffset + peerRecv*size;
-          prims.recv(thisOutput+recv_offset, nelem);
-        }
-      }
-    }
-  }
-}
+};
\ No newline at end of file
diff --git a/src/collectives/device/all_to_allv.cu b/src/collectives/device/all_to_allv.cu
old mode 100755
new mode 100644
index f78042c939..49d883c530
--- a/src/collectives/device/all_to_allv.cu
+++ b/src/collectives/device/all_to_allv.cu
@@ -8,5 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_FUNC(ncclAllToAllv, copy, FuncSum, i8, int8_t);
-IMPL_COLL_KERN(ncclAllToAllv, copy, FuncSum, i8, int8_t, 0);
+IMPL_COLL_FUNC(AllToAllv, RING, SIMPLE, Sum, int8_t);
diff --git a/src/collectives/device/all_to_allv.h b/src/collectives/device/all_to_allv.h
old mode 100755
new mode 100644
index c1bc3cf0d2..dffa2e1d60
--- a/src/collectives/device/all_to_allv.h
+++ b/src/collectives/device/all_to_allv.h
@@ -9,87 +9,89 @@
 #include "primitives.h"
 #include "collectives.h"
 
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclAllToAllvKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->a2av.nThreads;
-  const int nChannels = args->a2av.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const ssize_t typesize = args->a2av.count;
-  const int nranks = comm->nRanks;
-  const int bid = args->a2av.bid;
-  const int rank = ring->devUserRanks[0];
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS;
-  const int peersPerChan = DIVUP(nranks, nChannels);
-  const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllToAllv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+  public:
+  __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
+    const int tid = threadIdx.x;
+    const int nthreads = args->nThreads;
+    const int nChannels = args->a2av.nChannels;
+    struct ncclDevComm* comm = args->comm;
+    struct ncclChannel* channel = comm->channels+blockIdx.x;
+    struct ncclRing* ring = &channel->ring;
+    const ssize_t typesize = args->a2av.count;
+    const int nranks = comm->nRanks;
+    const int bid = args->a2av.bid;
+    const int rank = ring->devUserRanks[0];
+    const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+    const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS;
+    const int peersPerChan = DIVUP(nranks, nChannels);
+    const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
 
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
+    // Compute pointers
+    const T * __restrict__ thisInput = (const T*)args->sendbuff;
+    T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+    size_t* params = channel->a2avParams + nranks*4*args->index;
+    size_t *sendcounts = params;
+    size_t *sdispls = params + nranks;
+    size_t *recvcounts = params + nranks*2;
+    size_t *rdispls = params + nranks*3;
+    ssize_t size = sendcounts[rank]*typesize;
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      for (int i = 0; i < peersPerChan; i++) {
+        if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
+          (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
+          continue;
+        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
+        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+        ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
+        int nelem = min(realChunkSize, size-chunkOffset);
+        if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
+          if (tid < nthreads && thisInput != thisOutput) {
+            const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize;
+            T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize;
+            // local copy
+            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
+          }
+        }
+      }
+    }
 
-  size_t *sendcounts = args->a2av.extra;
-  size_t *sdispls = args->a2av.extra + nranks;
-  size_t *recvcounts = args->a2av.extra + nranks*2;
-  size_t *rdispls = args->a2av.extra + nranks*3;
-  ssize_t size = sendcounts[rank]*typesize;
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     for (int i = 0; i < peersPerChan; i++) {
       if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
         (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
         continue;
-      int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-      int nelem = min(realChunkSize, size-chunkOffset);
-      if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
-        if (tid < nthreads && thisInput != thisOutput) {
-          const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize;
-          T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize;
-          // local copy
-          ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
+      if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
+        int nthreadsSplit = nthreads/2;
+        if (tid < nthreadsSplit ) {
+          int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
+          ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 0, 1, 0, FUNC>
+            prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+          size = sendcounts[peerSend]*typesize;
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
+            ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T));
+            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
+            int nelem = min(realChunkSize, size-chunkOffset);
+            ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize;
+            prims.send(thisInput+send_offset, nelem);
+          }
+        } else {
+          int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
+          ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 1, 0, 0, FUNC>
+            prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
+          size = recvcounts[peerRecv]*typesize;
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
+            ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T));
+            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
+            int nelem = min(realChunkSize, size-chunkOffset);
+            ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize;
+            prims.recv(thisOutput+recv_offset, nelem);
+          }
         }
       }
     }
   }
-
-  for (int i = 0; i < peersPerChan; i++) {
-    if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-      (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-      continue;
-    if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
-      int nthreadsSplit = nthreads/2;
-      int peerNone[2] = {-1,-1};
-      if (tid < nthreadsSplit ) {
-        int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-        ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 2, 1, 0, FUNC>
-          prims(tid, nthreadsSplit, peerNone, &peerSend, NULL, stepSize, channel, comm);
-        size = sendcounts[peerSend]*typesize;
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-          ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T));
-          ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-          int nelem = min(realChunkSize, size-chunkOffset);
-          ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize;
-          prims.send(thisInput+send_offset, nelem);
-        }
-      } else {
-        int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-        ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 1, 2, 0, FUNC>
-          prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, peerNone, NULL, stepSize, channel, comm);
-        size = recvcounts[peerRecv]*typesize;
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-          ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T));
-          ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-          int nelem = min(realChunkSize, size-chunkOffset);
-          ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize;
-          prims.recv(thisOutput+recv_offset, nelem);
-        }
-      }
-    }
-  }
-}
+};
\ No newline at end of file
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 9c54eae4ef..9ce25fa656 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -77,7 +77,9 @@
   NCCL_FUNCS2B(AllGather), \
   NCCL_FUNCS2A(ReduceScatter), \
   NCCL_FUNCS2A(AllReduce), \
-  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t) }
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), \
+  NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t), \
+  NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t) }
 
 // Must be consistent with the ncclFuncSet enum
 using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);
@@ -93,6 +95,8 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
   NCCL_FUNCS2A(ReduceScatter),
   NCCL_FUNCS2A(AllReduce),
   NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
+  NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t),
+  NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t),
 #endif
 };
 
@@ -140,7 +144,9 @@ void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
     else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
   }
   else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
-  else ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
+  else if (c->funcIndex == 1800) ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
+  else if (c->funcIndex == 1801) ncclFunction_AllToAll_RING_SIMPLE_Sum_int8_t(c);
+  else if (c->funcIndex == 1802) ncclFunction_AllToAllv_RING_SIMPLE_Sum_int8_t(c);
 }
 
 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
diff --git a/src/collectives/device/gather.cu b/src/collectives/device/gather.cu
deleted file mode 100644
index 5aca1afdaa..0000000000
--- a/src/collectives/device/gather.cu
+++ /dev/null
@@ -1,12 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "gather.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_FUNC(ncclGather, copy, FuncSum, i8, int8_t);
-IMPL_COLL_KERN(ncclGather, copy, FuncSum, i8, int8_t, 0);
diff --git a/src/collectives/device/gather.h b/src/collectives/device/gather.h
deleted file mode 100644
index c704e39421..0000000000
--- a/src/collectives/device/gather.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "primitives.h"
-#include "collectives.h"
-
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclGatherKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->coll.count;
-  const int nranks = comm->nRanks;
-  const int bid = args->coll.bid;
-  const int rank = ring->devUserRanks[0];
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * GATHER_CHUNKSTEPS;
-  const int peersPerChan = DIVUP(nranks, nChannels);
-  const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
-  const int root = args->coll.root;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    for (int i = 0; i < peersPerChan; i++) {
-      if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-        (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-        continue;
-      int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-      int nelem = min(realChunkSize, size-chunkOffset);
-      if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
-        const T* sendbuff = thisInput+chunkOffset;
-        T* recvbuff = thisOutput+chunkOffset+rank*size;
-        if (tid < nthreads && sendbuff != recvbuff) {
-          // local copy
-          ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
-        }
-      }
-      else {
-        int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-        int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-        if (rank == root) {
-          ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
-            prims(tid, nthreads, &peerRecv, &peerSend, NULL, stepSize, channel, comm);
-
-          ssize_t recv_offset = chunkOffset + peerRecv*size;
-          prims.recv(thisOutput+recv_offset, nelem);
-        }
-        else {
-          if (peerSend == root) {
-            ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
-              prims(tid, nthreads, &peerRecv, &peerSend, NULL, stepSize, channel, comm);
-
-            ssize_t send_offset = chunkOffset;
-            prims.send(thisInput+send_offset, nelem);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/collectives/device/scatter.cu b/src/collectives/device/scatter.cu
deleted file mode 100644
index 6da04434f3..0000000000
--- a/src/collectives/device/scatter.cu
+++ /dev/null
@@ -1,12 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "scatter.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_FUNC(ncclScatter, copy, FuncSum, i8, int8_t);
-IMPL_COLL_KERN(ncclScatter, copy, FuncSum, i8, int8_t, 0);
diff --git a/src/collectives/device/scatter.h b/src/collectives/device/scatter.h
deleted file mode 100644
index f08544068a..0000000000
--- a/src/collectives/device/scatter.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "primitives.h"
-#include "collectives.h"
-
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclScatterKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->coll.count;
-  const int nranks = comm->nRanks;
-  const int bid = args->coll.bid;
-  const int rank = ring->devUserRanks[0];
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * SCATTER_CHUNKSTEPS;
-  const int peersPerChan = DIVUP(nranks, nChannels);
-  const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
-  const int root = args->coll.root;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    for (int i = 0; i < peersPerChan; i++) {
-      if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-        (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-        continue;
-      int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-      int nelem = min(realChunkSize, size-chunkOffset);
-      if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
-        const T* sendbuff = thisInput+chunkOffset+rank*size;
-        T* recvbuff = thisOutput+chunkOffset;
-        if (tid < nthreads && sendbuff != recvbuff) {
-          // local copy
-          ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
-        }
-      }
-      else {
-        int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-        int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-        if (rank == root) {
-          ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
-            prims(tid, nthreads, &peerRecv, &peerSend, NULL, stepSize, channel, comm);
-
-          ssize_t send_offset = chunkOffset + peerSend*size;
-          prims.send(thisInput+send_offset, nelem);
-        }
-        else {
-          if (peerRecv == root) {
-            ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
-              prims(tid, nthreads, &peerRecv, &peerSend, NULL, stepSize, channel, comm);
-
-            ssize_t recv_offset = chunkOffset;
-            prims.recv(thisOutput+recv_offset, nelem);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/collectives/gather_api.cc b/src/collectives/gather_api.cc
index f24e38b887..8b53538933 100644
--- a/src/collectives/gather_api.cc
+++ b/src/collectives/gather_api.cc
@@ -12,7 +12,6 @@ NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t
     ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
-  if (comm->alltoallDisable) {
     int nRanks;
     NCCLCHECK(ncclCommCount(comm, &nRanks));
     size_t rankOffset = sendcount * ncclTypeSize(datatype);
@@ -27,12 +26,4 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
     NCCLCHECK(ncclGroupEnd());
     return ncclSuccess;
-  }
-  else {
-    //struct ncclInfo info = { ncclCollGather, "Gather",
-    //  sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
-    //  GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
-    //return ncclEnqueueCheck(&info);
-    return ncclInternalError;
-  }
 }
diff --git a/src/collectives/scatter_api.cc b/src/collectives/scatter_api.cc
index b573294b00..2a0bc20df4 100644
--- a/src/collectives/scatter_api.cc
+++ b/src/collectives/scatter_api.cc
@@ -12,7 +12,6 @@ NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t
     ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
     ncclComm_t comm, hipStream_t stream) {
-  if (comm->alltoallDisable) {
     int nRanks;
     NCCLCHECK(ncclCommCount(comm, &nRanks));
     size_t rankOffset = recvcount * ncclTypeSize(datatype);
@@ -27,12 +26,4 @@ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
     NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
     NCCLCHECK(ncclGroupEnd());
     return ncclSuccess;
-  }
-  else {
-    //struct ncclInfo info = { ncclCollScatter, "Scatter",
-    //  sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
-    //  SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
-    //return ncclEnqueueCheck(&info);
-    return ncclInternalError;
-  }
 }
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 937d1a8f92..e32b519974 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -8,6 +8,7 @@
 #include "enqueue.h"
 #include "argcheck.h"
 #include "coll_net.h"
+#include "graph/topo.h"
 
 // Only generate inline kernels for LL
 #define NCCL_FUNC5(func, algo, redop, dtype) \
@@ -107,7 +108,7 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
 
 static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
   // Only launch blocks where we have work to do.
-  for (int c=0; c<comm->p2pnChannels; c++) {
+  for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
     if (comm->channels[c].workCount) params->gridDim.x = c+1;
   }
 
@@ -241,7 +242,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
     max = std::max(max, channel->workFifoTail);
     channel->workCount = 0;
   }
-  for (int r=0; r<comm->p2pnChannels; r++) {
+  for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
     struct ncclChannel* channel = comm->channels+r;
     channel->workFifoTail = max;
   }
@@ -272,6 +273,13 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
   struct ncclComm* comm = info->comm;
   float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
   // Find algorithm / protocol.
+  if (info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv) {
+    info->algorithm = NCCL_ALGO_RING;
+    info->protocol = NCCL_PROTO_SIMPLE;
+    info->nChannels = comm->nChannels;
+    info->nThreads = NCCL_MAX_NTHREADS;
+    return ncclSuccess;
+  }
   info->algorithm = -1;
   info->protocol = -1;
   int nAlgos = NCCL_NUM_ALGORITHMS;
@@ -332,6 +340,9 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
       info->pattern = ncclPatternRing; break;
     case ncclFuncAllReduce:
       info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
+    case ncclFuncAllToAll:
+    case ncclFuncAllToAllv:
+      info->pattern = ncclPatternAll; break;
     default:
       WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
       return ncclInternalError;
@@ -353,6 +364,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
       info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
     case ncclPatternRingTwice:
       info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
+    case ncclPatternAll:
+      info->nstepsPerLoop = 1;
+      info->nchunksPerLoop = info->comm->nRanks; break;
     default:
       WARN("Unknown pattern %d\n", info->pattern);
       return ncclInternalError;
@@ -368,12 +382,21 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
   NCCLCHECK(getPatternInfo(info));
   NCCLCHECK(getLoopInfo(info));
 
+  if ((info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv)
+    && info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
+    info->nChannels =info->comm->p2pnChannels;
+
   work->opCount = info->comm->opCount;
   work->sendbuff = info->sendbuff;
   work->recvbuff = info->recvbuff;
-  work->coll.root = info->root;
-  work->coll.count = info->count;
-  work->coll.nChannels = info->nChannels;
+  if (info->coll == ncclFuncAllToAllv)  {
+    work->a2av.count = info->count;
+    work->a2av.nChannels = info->nChannels;
+  } else {
+    work->coll.root = info->root;
+    work->coll.count = info->count;
+    work->coll.nChannels = info->nChannels;
+  }
   work->nThreads = info->nThreads;
 
   work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
@@ -481,8 +504,21 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
     if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
 
     info->comm->myParams->gridDim.x++;
-    work.coll.bid = bid % nChannels;
-    NCCLCHECK(getNextOp(channel, NULL, &work));
+    if (info->coll == ncclFuncAllToAllv) {
+      work.a2av.bid = bid % work.a2av.nChannels;
+    } else {
+      work.coll.bid = bid % nChannels;
+    }
+    struct ncclWork* w;
+    NCCLCHECK(getNextOp(channel, &w, &work));
+    if (info->coll == ncclFuncAllToAllv) {
+      struct ncclWorkElem* e = w->elems;
+      size_t* params = channel->a2avParams + info->comm->nRanks*4*e->index;
+      memcpy(params, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
+      memcpy(params+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
+      memcpy(params+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
+      memcpy(params+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
+    }
   }
   info->comm->opCount++;
   return ncclSuccess;
@@ -565,7 +601,9 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
 }
 
 static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
-  for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != info->delta; s++) {
+  const int e = (info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
+    ? 1 : NCCL_MAX_WORK_ELEMENTS;
+  for (int s=0; s<e && work->elems[s].p2p.delta != info->delta; s++) {
     if (work->elems[s].p2p.nThreads == 0) return s;
   }
   return -1;
@@ -632,7 +670,12 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
     NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
     NCCLCHECKGOTO(checkSetStream(info), ret, end);
 
-    INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+    if (info->coll == ncclFuncAllToAllv)
+      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
+        info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
+        info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+    else if (info->coll != ncclFuncSendRecv)
+      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
         info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
         info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
 
@@ -653,7 +696,12 @@ end:
     NCCLCHECK(ArgsCheck(info));
     NCCLCHECK(checkSetStream(info));
 
-    INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+    if (info->coll == ncclFuncAllToAllv)
+      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
+        info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
+        info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+    else
+      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
         info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
         info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
 
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 381d263b44..567043d4f6 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -519,12 +519,19 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
     }
   }
 
-  // Round to next pow2 nChannelsPerPeer and nChannels
-  comm->p2pnChannelsPerPeer = nextPow2(minChannels);
-  comm->p2pnChannels = nextPow2(comm->p2pnChannels);
+  if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) {
+    // Adjust P2P channels on Rome
+    comm->p2pnChannelsPerPeer = 2;
+    comm->p2pnChannels = 2;
+  }
+  else {
+    // Round to next pow2 nChannelsPerPeer and nChannels
+    comm->p2pnChannelsPerPeer = nextPow2(minChannels);
+    comm->p2pnChannels = nextPow2(comm->p2pnChannels);
+  }
 
   // Init channels that weren't used so far
-  for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
+  for (int c=comm->nChannels; c<std::max(comm->nChannels, comm->p2pnChannels); c++) NCCLCHECK(initChannel(comm, c));
 
   // We want to spread channels used when there aren't many and progressively
   // fill the whole space of nChannels. To do so we mirror the bits in the
diff --git a/src/group.cc b/src/group.cc
index 8572cda7a7..df9cd85ddd 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -226,9 +226,9 @@ ncclResult_t ncclGroupEnd() {
       int nChannelsMax = comm->p2pnChannelsPerPeer;
       int nChannelsMin = nChannelsMax;
       // Try to use all channels, but one channel per operation.
-      while (nChannelsMin*comm->nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
+      while (nChannelsMin*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels) && nChannelsMin > 1) nChannelsMin /= 2;
       // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
-      while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
+      while (nChannelsMax*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels)*4 && nChannelsMax > 1) nChannelsMax /= 2;
 
       while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
         // schedule delta 0, +1, -1, +2, -2, ...
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 05e8b9ee5f..6b905385fd 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -8,8 +8,10 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_
 
-#define FUNC_INDEX_P2P 1800
-#define FUNC_INDEX(func, redop, ncclType, al, pr) ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
+#define FUNC_INDEX_P2P (NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
+#define FUNC_INDEX(func, redop, ncclType, al, pr) ((func >= NCCL_NUM_FUNCTIONS) \
+  ? (func-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
+  : ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
 
 #define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
   ncclFunction_##func##_##algo##_##proto##_##redop##_##type
@@ -60,6 +62,8 @@
   DECL(ReduceScatter) \
   DECL(AllReduce) \
   DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
+  DECL5(AllToAll, RING, SIMPLE, Sum, int8_t) \
+  DECL5(AllToAllv, RING, SIMPLE, Sum, int8_t) \
 
 DECL_ALL
 
@@ -81,4 +85,12 @@ DECL_ALL
 #define REDUCE_SLICESTEPS 1
 #define REDUCE_CHUNKSTEPS 1
 #define SENDRECV_SLICEFACTOR 1
+#define GATHER_SLICESTEPS 4
+#define GATHER_CHUNKSTEPS 4
+#define SCATTER_SLICESTEPS 4
+#define SCATTER_CHUNKSTEPS 4
+#define ALLTOALL_SLICESTEPS 4
+#define ALLTOALL_CHUNKSTEPS 4
+#define ALLTOALLV_SLICESTEPS 4
+#define ALLTOALLV_CHUNKSTEPS 4
 #endif
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index f64e3094f0..341ddb2862 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -23,8 +23,8 @@
 #endif
 
 #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
-extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAll, ncclFuncAllToAllv } ncclFunc_t;
+extern const char* ncclFuncStr[];
 
 #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
 #define NCCL_ALGO_TREE 0
@@ -183,6 +183,11 @@ struct ncclWorkElem {
       int32_t delta;
       uint16_t nThreads;
     } p2p;
+    struct {
+      size_t count;
+      uint8_t bid;
+      uint8_t nChannels;
+    } a2av;
     uint64_t align[3];
   };
 };
@@ -208,6 +213,7 @@ struct ncclChannel {
       struct ncclWork* workFifo;
       int workCount;
       uint64_t workFifoTail; // Only used by CPU
+      size_t* a2avParams;
 
 #ifdef ENABLE_PROFILING
       struct timeval tvs;
diff --git a/src/include/info.h b/src/include/info.h
index 6b71492bd1..fa9402f350 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -20,7 +20,8 @@ typedef enum {
   ncclPatternTreeDown,
   ncclPatternTreeUpDown,
   ncclPatternCollTreeUp,
-  ncclPatternCollTreeDown
+  ncclPatternCollTreeDown,
+  ncclPatternAll
 } ncclPattern_t;
 
 // Used to pass NCCL call information between functions
@@ -39,6 +40,11 @@ struct ncclInfo {
   // Algorithm details
   int chunkSteps;
   int sliceSteps;
+  // For alltoallv
+  const size_t *sendcounts;
+  const size_t *sdispls;
+  const size_t *recvcounts;
+  const size_t *rdispls;
   // Computed later
   int algorithm;
   int protocol;
diff --git a/src/init.cc b/src/init.cc
index 9a8a53618a..734b992f89 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -41,7 +41,7 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
 #endif
 
-const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "AllToAll", "AllToAllv" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 
@@ -395,7 +395,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
 
   comm->alltoallDisable = true;
-  //if (rcclParamAllToAllDisable() == 0) comm->alltoallDisable = false;
+  if (rcclParamAllToAllDisable() == 0) comm->alltoallDisable = false;
 
   *comret = comm;
   return ncclSuccess;
@@ -403,11 +403,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 
 static ncclResult_t devCommSetup(ncclComm_t comm) {
   // Duplicate the channels on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
-  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, std::max(comm->nChannels, comm->p2pnChannels)));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, std::max(comm->nChannels, comm->p2pnChannels)));
 
   // Copy userRanks and peers
-  for (int r=0; r<comm->p2pnChannels; r++) {
+  for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
     NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
   }
 
@@ -838,7 +838,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
   allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
   allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
-  allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
+  allGather3Data[rank].alltoallDisable = comm->topo->nodes[NET].count? 1 : comm->alltoallDisable;
 
   allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
     std::min(treeGraph.nChannels, ringGraph.nChannels);
@@ -1031,6 +1031,29 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 
+  if (!alltoallDisable) {
+    int nc = comm->p2pnChannels;
+    for (int c=0; c<nc; c++) {
+      const int peersPerChan = DIVUP(nranks, nc);
+      for (int p=0; p<peersPerChan; p++) {
+        // first channel is reserved for self copy
+        if ((c*peersPerChan+p)%nranks == 0)
+          continue;
+        int peerSend = (rank+c*peersPerChan+p)%nranks;
+        int peerRecv = (2*nranks+rank-(c*peersPerChan)%nranks-p)%nranks;
+        if (comm->channels[c].peers[peerSend].send.connected == 0) {
+          comm->connectSend[peerSend] |= (1<<c);
+          comm->connect = 1;
+        }
+        if (comm->channels[c].peers[peerRecv].recv.connected == 0) {
+          comm->connectRecv[peerRecv] |= (1<<c);
+          comm->connect = 1;
+        }
+      }
+    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, NULL));
+  }
+
   NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
 
   if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index 716cd53408..741a3d6141 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -46,11 +46,16 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
   }
   // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
   info->nBytes = info->count * ncclTypeSize(info->datatype);
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAll) {
     info->count = info->nBytes;
     info->datatype = ncclInt8;
   }
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+  if (info->coll == ncclFuncAllToAllv) {
+    // Use count to store data type size for alltoallv
+    info->count = ncclTypeSize(info->datatype);
+    info->datatype = ncclInt8;
+  }
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter || info->coll == ncclFuncAllToAll) info->nBytes *= info->comm->nRanks; // count is per rank
 
   if (info->op < 0 || info->op >= ncclNumOps) {
     WARN("%s : invalid reduction operation %d", info->opName, info->op);
diff --git a/src/proxy.cc b/src/proxy.cc
index b2434b19c7..2ba7aabfb4 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -425,19 +425,19 @@ ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* si
 
   char* buff;
   int* used;
-  *size = 2*comm->p2pnChannels*state->slotSize*state->nslots;
+  *size = 2*std::max(comm->nChannels, comm->p2pnChannels)*state->slotSize*state->nslots;
 
   if (cuda && state->cudaBuff[0] == NULL) {
     NCCLCHECK(ncclCudaCalloc(&buff, *size, cuda));
-    NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
-    for (int i=0; i<2*comm->p2pnChannels; i++) {
+    NCCLCHECK(ncclCalloc(&used, 2*std::max(comm->nChannels, comm->p2pnChannels)*state->nslots));
+    for (int i=0; i<2*std::max(comm->nChannels, comm->p2pnChannels); i++) {
       state->cudaBuff[i] = buff + state->nslots*state->slotSize*i;
       state->cudaUsed[i] = used + state->nslots*i;
     }
   } else if (state->hostBuff[0] == NULL) {
     NCCLCHECK(ncclCudaHostCalloc(&buff, *size));
-    NCCLCHECK(ncclCalloc(&used, 2*comm->p2pnChannels*state->nslots));
-    for (int i=0; i<2*comm->p2pnChannels; i++) {
+    NCCLCHECK(ncclCalloc(&used, 2*std::max(comm->nChannels, comm->p2pnChannels)*state->nslots));
+    for (int i=0; i<2*std::max(comm->nChannels, comm->p2pnChannels); i++) {
       state->hostBuff[i] = buff + state->nslots*state->slotSize*i;
       state->hostUsed[i] = used + state->nslots*i;
     }
diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp
index 45d2723475..ca216eee81 100644
--- a/tools/topo_expl/utils.cpp
+++ b/tools/topo_expl/utils.cpp
@@ -30,7 +30,7 @@
 #include "model.h"
 #include "utils.h"
 
-const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "AllToAll" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 
@@ -282,7 +282,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
   allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
   allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
-  allGather3Data[rank].alltoallDisable = comm->alltoallDisable;
+  allGather3Data[rank].alltoallDisable = comm->topo->nodes[NET].count? 1 : comm->alltoallDisable;
 
   allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
     std::min(treeGraph.nChannels, ringGraph.nChannels);
@@ -624,6 +624,29 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 
+  if (!alltoallDisable) {
+    int nc = comm->nChannels;
+    for (int c=0; c<nc; c++) {
+      const int peersPerChan = DIVUP(nranks, nc);
+      for (int p=0; p<peersPerChan; p++) {
+        // first channel is reserved for self copy
+        if ((c*peersPerChan+p)%nranks == 0)
+          continue;
+        int peerSend = (rank+c*peersPerChan+p)%nranks;
+        int peerRecv = (2*nranks+rank-(c*peersPerChan)%nranks-p)%nranks;
+        if (comm->channels[c].peers[peerSend].send.connected == 0) {
+          comm->connectSend[peerSend] |= (1<<c);
+          comm->connect = 1;
+        }
+        if (comm->channels[c].peers[peerRecv].recv.connected == 0) {
+          comm->connectRecv[peerRecv] |= (1<<c);
+          comm->connect = 1;
+        }
+      }
+    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, NULL));
+  }
+
   //NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
 
   //if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));

From b33a2cac8b0aee63f8fb284f1cffbce03b10b30a Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Fri, 13 Nov 2020 17:57:44 -0800
Subject: [PATCH 07/25] gtest: add scatter to combined calls and use loops
 (#303)

* gtest: add scatter to combined calls and use loops

* gtest: run validation inside loop

* gtest: revert small element count to 2520

* gtest: fix memory leak in validation

(cherry picked from commit b0853ccd516cee04879e890a897967bbaa368375)

* Fix combined call UT

* Fix memory leak

* Fix alltoallv test
---
 test/CorrectnessTest.hpp     |  3 +-
 test/test_AllGather.hpp      |  2 +
 test/test_AllReduceAbort.cpp |  1 +
 test/test_AllToAllv.cpp      | 10 ++---
 test/test_BroadcastAbort.cpp |  1 +
 test/test_CombinedCalls.cpp  | 77 ++++++++++++++++++++++--------------
 test/test_Gather.cpp         |  2 +-
 test/test_Scatter.cpp        |  2 +-
 8 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/test/CorrectnessTest.hpp b/test/CorrectnessTest.hpp
index 95010ee701..3c7d6722b9 100644
--- a/test/CorrectnessTest.hpp
+++ b/test/CorrectnessTest.hpp
@@ -121,7 +121,7 @@ namespace CorrectnessTests
         // Explicit memory release to avoid double-free from subDatasets
         void Release()
         {
-            for (int i = 0; i < outputs.size(); i++)
+            for (int i = 0; i < numDevices; i++)
             {
                 if (!inPlace) hipFree(outputs[i]);
                 hipFree(inputs[i]);
@@ -443,6 +443,7 @@ namespace CorrectnessTests
                 }
                 ASSERT_EQ(isMatch, true);
             }
+            free(outputI1);
         }
 
         // Passed in parameters from TestTuple
diff --git a/test/test_AllGather.hpp b/test/test_AllGather.hpp
index 615616f36d..06a0297c3d 100644
--- a/test/test_AllGather.hpp
+++ b/test/test_AllGather.hpp
@@ -25,6 +25,8 @@ namespace CorrectnessTests
 
             for (int i = 0; i < dataset.numDevices; i++)
                 memcpy(dataset.expected[i], result, dataset.NumBytes());
+
+            free(result);
         }
     };
 }
diff --git a/test/test_AllReduceAbort.cpp b/test/test_AllReduceAbort.cpp
index ba63ea9004..23d7129814 100644
--- a/test/test_AllReduceAbort.cpp
+++ b/test/test_AllReduceAbort.cpp
@@ -114,6 +114,7 @@ namespace CorrectnessTests
             if (idle) pthread_yield();
         }
 
+        free(done);
         HIPCHECK(hipHostFree(fake_head));
         HIPCHECK(hipStreamDestroy(stream));
         dataset.Release();
diff --git a/test/test_AllToAllv.cpp b/test/test_AllToAllv.cpp
index 6f2e317f44..fe78b0f352 100644
--- a/test/test_AllToAllv.cpp
+++ b/test/test_AllToAllv.cpp
@@ -22,7 +22,7 @@ namespace CorrectnessTests
 
         size_t chunksize = numElements*2/numDevices;
         #define MAX_ALLTOALLV_RANKS 16
-        static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS];
+        static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
         // Launch the reduction (1 thread per GPU)
         ncclGroupStart();
         for (int r = 0; r < numDevices; r++) {
@@ -31,12 +31,12 @@ namespace CorrectnessTests
                 size_t scount = ((i+r)%numDevices)*chunksize;
                 if (i+r == numDevices-1)
                   scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2);
-                sendcounts[i] = recvcounts[i] = scount;
-                sdispls[i] = rdispls[i] = disp;
+                sendcounts[i+r*MAX_ALLTOALLV_RANKS] = recvcounts[i+r*MAX_ALLTOALLV_RANKS] = scount;
+                sdispls[i+r*MAX_ALLTOALLV_RANKS] = rdispls[i+r*MAX_ALLTOALLV_RANKS] = disp;
                 disp += scount;
             }
-            ncclAllToAllv((char*)dataset.inputs[r], sendcounts, sdispls,
-              (char*)dataset.outputs[r], recvcounts, rdispls, dataType, comms[r], streams[r]);
+            ncclAllToAllv((char*)dataset.inputs[r], sendcounts+r*MAX_ALLTOALLV_RANKS, sdispls+r*MAX_ALLTOALLV_RANKS,
+              (char*)dataset.outputs[r], recvcounts+r*MAX_ALLTOALLV_RANKS, rdispls+r*MAX_ALLTOALLV_RANKS, dataType, comms[r], streams[r]);
         }
         ncclGroupEnd();
         // Wait for reduction to complete
diff --git a/test/test_BroadcastAbort.cpp b/test/test_BroadcastAbort.cpp
index 96f1de6ccf..ddab18a8e9 100644
--- a/test/test_BroadcastAbort.cpp
+++ b/test/test_BroadcastAbort.cpp
@@ -116,6 +116,7 @@ namespace CorrectnessTests
             if (idle) pthread_yield();
         }
 
+        free(done);
         HIPCHECK(hipHostFree(fake_head));
         HIPCHECK(hipStreamDestroy(stream));
         dataset.Release();
diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp
index db4f938fc0..e7be7385a1 100644
--- a/test/test_CombinedCalls.cpp
+++ b/test/test_CombinedCalls.cpp
@@ -10,6 +10,7 @@
 #include "test_Broadcast.hpp"
 #include "test_Reduce.hpp"
 #include "test_ReduceScatter.hpp"
+#include "test_Scatter.hpp"
 
 namespace CorrectnessTests
 {
@@ -25,6 +26,10 @@ namespace CorrectnessTests
             FillDatasetWithPattern(datasets[i]);
         }
 
+        Dataset scatter_dataset;
+        scatter_dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter);
+        FillDatasetWithPattern(scatter_dataset);
+
         // Compute expected results for each dataset in combined
         int const root = 0;
         AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
@@ -32,46 +37,60 @@ namespace CorrectnessTests
         BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
         ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
         ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
+        ScatterCorrectnessTest::ComputeExpectedResults(scatter_dataset, root);
 
         size_t const byteCount = datasets[0].NumBytes() / numDevices;
         size_t const elemCount = numElements / numDevices;
 
-        ncclGroupStart();
-        for (int i = 0; i < numDevices; i++)
+        for (int j = 0; j < 10; j++)
         {
-            ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
-                          datasets[0].outputs[i], elemCount,
-                          dataType, comms[i], streams[i]);
+            ncclGroupStart();
+            for (int i = 0; i < numDevices; i++)
+            {
+                ncclScatter(scatter_dataset.inputs[i],
+                              scatter_dataset.outputs[i],
+                              numElements, dataType,
+                              root, comms[i], streams[i]);
+            }
+            ncclGroupEnd();
+            ncclGroupStart();
+            for (int i = 0; i < numDevices; i++)
+            {
+                ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
+                              datasets[0].outputs[i], elemCount,
+                              dataType, comms[i], streams[i]);
 
-            ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
-                          numElements, dataType, op, comms[i], streams[i]);
+                ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
+                              numElements, dataType, op, comms[i], streams[i]);
 
-            ncclBroadcast(datasets[2].inputs[i],
-                          datasets[2].outputs[i],
-                          numElements, dataType,
-                          root, comms[i], streams[i]);
+                ncclBroadcast(datasets[2].inputs[i],
+                              datasets[2].outputs[i],
+                              numElements, dataType,
+                              root, comms[i], streams[i]);
 
-            ncclReduce(datasets[3].inputs[i],
-                       datasets[3].outputs[i],
-                       numElements, dataType, op,
-                       root, comms[i], streams[i]);
+                ncclReduce(datasets[3].inputs[i],
+                           datasets[3].outputs[i],
+                           numElements, dataType, op,
+                           root, comms[i], streams[i]);
 
-            ncclReduceScatter(datasets[4].inputs[i],
-                              (int8_t *)datasets[4].outputs[i] + (i * byteCount),
-                              elemCount, dataType, op,
-                              comms[i], streams[i]);
+                ncclReduceScatter(datasets[4].inputs[i],
+                                  (int8_t *)datasets[4].outputs[i] + (i * byteCount),
+                                  elemCount, dataType, op,
+                                  comms[i], streams[i]);
+            }
+            ncclGroupEnd();
+            // Wait for reduction to complete
+            Synchronize();
+            // Check results for each collective in the combined
+            for (int i = 0; i < 5; i++)
+                ValidateResults(datasets[i]);
+
+            ValidateResults(scatter_dataset);
         }
-        ncclGroupEnd();
 
-        // Wait for reduction to complete
-        Synchronize();
-
-        // Check results for each collective in the combined
         for (int i = 0; i < 5; i++)
-        {
-            ValidateResults(datasets[i]);
             datasets[i].Release();
-        }
+        scatter_dataset.Release();
     }
 
     INSTANTIATE_TEST_SUITE_P(CombinedCallsCorrectnessSweep,
@@ -95,7 +114,7 @@ namespace CorrectnessTests
                                 // Number of devices
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
+                                testing::Values(false),
+                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_Gather.cpp b/test/test_Gather.cpp
index 8169127966..bf917c739d 100644
--- a/test/test_Gather.cpp
+++ b/test/test_Gather.cpp
@@ -66,6 +66,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_Scatter.cpp b/test/test_Scatter.cpp
index 78cfa01ec3..aaca748217 100644
--- a/test/test_Scatter.cpp
+++ b/test/test_Scatter.cpp
@@ -66,6 +66,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace

From 2ddbe6646b3a6613bd9ca1ae3e3e9b2a14c46fe6 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 22 Dec 2020 13:28:21 -0500
Subject: [PATCH 08/25] Improve collective trace

---
 src/collectives/device/common.h | 24 +++++++++----
 src/include/collectives.h       |  3 ++
 src/include/devcomm.h           | 13 ++++++-
 src/init.cc                     | 63 +++++++++++++++++++++++++--------
 tools/topo_expl/utils.cpp       |  2 +-
 5 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 9ce25fa656..d96e2e307b 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -117,6 +117,8 @@ struct Caller<f, f + 1>{
   void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); }
 };
 
+static_assert(FUNC_INDEX_P2P == 1800, "Wrong P2P function index");
+
 inline
 __device__
 void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
@@ -177,7 +179,19 @@ class ncclFunction {
     comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
     comm->collTrace[pos].opCount = w->opCount; \
     comm->collTrace[pos].bid = bid; \
-    comm->collTrace[pos].funcIndex = fIdx;
+    comm->collTrace[pos].funcIndex = fIdx; \
+    if (fIdx == FUNC_INDEX_P2P) { \
+      comm->collTrace[pos].p2p.nThreads = w->p2p.nThreads; \
+      comm->collTrace[pos].p2p.delta = (uint16_t)(w->p2p.delta); \
+    } else if (fIdx == FUNC_INDEX_A2AV) { \
+      comm->collTrace[pos].coll.nThreads = w->nThreads; \
+      comm->collTrace[pos].coll.bid = w->a2av.bid; \
+      comm->collTrace[pos].coll.nChannels = w->a2av.nChannels; \
+    } else { \
+      comm->collTrace[pos].coll.nThreads = w->nThreads; \
+      comm->collTrace[pos].coll.bid = w->coll.bid; \
+      comm->collTrace[pos].coll.nChannels = w->coll.nChannels; \
+    }
 #define traceKernelLaunch(fIdx)  { \
     traceColl(fIdx); \
     asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (comm->collTrace[pos].data_0)); \
@@ -262,7 +276,7 @@ __device__ void ncclKernel(struct ncclWorkElem first)  {
     if (w == NULL) {
       w = shmem.localWork.elems;
       if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) {
-        if (COLLTRACE && tid == 0) traceAbort(-1);
+        if (COLLTRACE && tid == 0) traceAbort(0xffff);
         return;
       }
       if (COLLTRACE && tid == 0) {
@@ -270,11 +284,9 @@ __device__ void ncclKernel(struct ncclWorkElem first)  {
         if (!firstLaunch) traceCollEnd(w->funcIndex);
         firstLaunch = false;
       }
-    } else {
-      if (COLLTRACE && tid == 0) {
+    } else if (COLLTRACE && tid == 0) {
         traceKernelLaunch(w->funcIndex);
         firstLaunch = false;
-      }
     }
     if (tid < w->nThreads) {
       if (w->funcIndex == FINDEX) {
@@ -285,7 +297,7 @@ __device__ void ncclKernel(struct ncclWorkElem first)  {
     }
     index = (index+1) % NCCL_MAX_OPS;
     if (w->active == 2) {
-      if (COLLTRACE && tid == 0) traceCollEnd(-1);
+      if (COLLTRACE && tid == 0) traceCollEnd(0xffff);
       return;
     }
     w = NULL;
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 6b905385fd..eef7a3cd16 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -9,6 +9,9 @@
 #define NCCL_COLLECTIVES_H_
 
 #define FUNC_INDEX_P2P (NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
+#define FUNC_INDEX_A2A (FUNC_INDEX_P2P+1)
+#define FUNC_INDEX_A2AV (FUNC_INDEX_P2P+2)
+
 #define FUNC_INDEX(func, redop, ncclType, al, pr) ((func >= NCCL_NUM_FUNCTIONS) \
   ? (func-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
   : ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 341ddb2862..7fa6842b8d 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -286,7 +286,18 @@ struct ncclCollTrace {
   uint32_t data_0;
   uint64_t timeStamp;
   uint64_t opCount;
-  uint64_t data_1;
+  union {
+    uint64_t data_1;
+    struct {
+      uint16_t nThreads;
+      uint8_t bid;
+      uint8_t nChannels;
+    } coll;
+    struct {
+      uint16_t nThreads;
+      uint16_t delta;
+    } p2p;
+  };
 };
 static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
 
diff --git a/src/init.cc b/src/init.cc
index 734b992f89..421ceeda07 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -41,9 +41,11 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
 #endif
 
-const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "AllToAll", "AllToAllv" };
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
+const char* ncclRedOpStr[ncclNumOps] = { "Sum", "Prod", "Max", "Min" };
+const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "_u64", "_f16", "_f32", "_f64", "_b16"};
 
 NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
 
@@ -161,6 +163,25 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
 void *ncclCommThreadMain(void *arg) {
   ncclComm_t comm = (ncclComm_t)arg;
   int head = comm->hostDevComm.collTraceHead;
+  #define MAX_NAME_LENGTH 32
+  char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_A2AV+1));
+  for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
+    for (int al = 0; al < NCCL_NUM_ALGORITHMS; al++) {
+      for (int type = 0; type < ncclNumTypes; type++) {
+        for (int pr = 0; pr < NCCL_NUM_PROTOCOLS; pr++) {
+          for (int redop = 0; redop < ncclNumOps; redop++) {
+            char* line = func_names+MAX_NAME_LENGTH*FUNC_INDEX(func, redop, type, al, pr);
+            sprintf(line, "%s%s%s%s%s", ncclFuncStr[func], ncclAlgoStr[al], ncclProtoStr[pr],
+              ncclRedOpStr[redop], ncclTypeStr[type]);
+          }
+        }
+      }
+    }
+  }
+  for (int func = NCCL_NUM_FUNCTIONS; func < NCCL_NUM_FUNCTIONS+3; func++) {
+    char* line = func_names+MAX_NAME_LENGTH*(FUNC_INDEX_P2P+func-NCCL_NUM_FUNCTIONS);
+    sprintf(line, "%s", ncclFuncStr[func]);
+  }
   do {
     int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS;
     int count;
@@ -177,32 +198,45 @@ void *ncclCommThreadMain(void *arg) {
       }
     }
     for (int i = 0; i < count; i++) {
-      uint8_t type = LOAD(&(comm->hostDevComm.collTrace[head].type));
+      struct ncclCollTrace *td = comm->hostDevComm.collTrace+head;
+      uint8_t type = LOAD(&(td->type));
       if (type == ncclCollTraceNotReady)
         break;
       char line[1024];
       int offset = 0;
+      uint16_t fIdx = td->funcIndex;
       #define VEGA_GPU_RTC_FREQUENCY 2.5E7
       if (type == ncclCollTraceDataType) {
         sprintf(line, "## [%12.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
-          (double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid,
-          comm->hostDevComm.collTrace[head].funcIndex,
-          comm->hostDevComm.collTrace[head].data_0,
-          comm->hostDevComm.collTrace[head].opCount,
-          comm->hostDevComm.collTrace[head].data_1);
+          (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid,
+          fIdx, td->data_0, td->opCount, td->data_1);
       } else {
         sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
-          (double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount);
+          (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount);
         offset = strlen(line);
         switch (type) {
           case ncclCollTraceKernelLaunchType:
-            sprintf(line+offset, " KL hwid %8x funcIndex %d",
-              comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex);
+            sprintf(line+offset, " KL HWID %8x %s ",
+              td->data_0, func_names+MAX_NAME_LENGTH*fIdx);
+            offset = strlen(line);
+            if (fIdx > FUNC_INDEX_A2AV)
+              sprintf(line+offset, "ERROR bad function index %d", fIdx);
+            else if (fIdx == FUNC_INDEX_P2P)
+              sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
+            else
+              sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels);
             break;
           case ncclCollTraceCollEndType:
-            if (comm->hostDevComm.collTrace[head].funcIndex != -1)
-              sprintf(line+offset, " CE next funcIndex %d",
-                comm->hostDevComm.collTrace[head].funcIndex);
+            if (fIdx != 0xffff) {
+              sprintf(line+offset, " CE %s ", func_names+MAX_NAME_LENGTH*fIdx);
+              offset = strlen(line);
+              if (fIdx > FUNC_INDEX_A2AV)
+                sprintf(line+offset, "ERROR bad function index %d", fIdx);
+              else if (fIdx == FUNC_INDEX_P2P)
+                sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
+              else
+                sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels);
+            }
             else
               sprintf(line+offset, " KE");
             break;
@@ -215,11 +249,12 @@ void *ncclCommThreadMain(void *arg) {
         }
       }
       INFO(NCCL_COLL, "%s", line);
-      STORE(&(comm->hostDevComm.collTrace[head].type), ncclCollTraceNotReady);
+      STORE(&(td->type), ncclCollTraceNotReady);
       head ++;
       head %= COLLTRACE_NUM_ITEMS;
     }
   } while(1);
+  free(func_names);
   comm->hostDevComm.collTraceHead = head;
   pthread_exit(NULL);
 }
diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp
index ca216eee81..e5bcf768fd 100644
--- a/tools/topo_expl/utils.cpp
+++ b/tools/topo_expl/utils.cpp
@@ -30,7 +30,7 @@
 #include "model.h"
 #include "utils.h"
 
-const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "AllToAll" };
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 

From 3e62ceddc50a6c17ca7a4525d40c4bacd35054a6 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Thu, 28 Jan 2021 09:45:01 -0700
Subject: [PATCH 09/25] Clique kernel support (#295) (#15)

* Adding experimental clique-based kernels (opt-in only)

Co-authored-by: Stanley Tsang <stanley.tsang@amd.com>
Co-authored-by: Gilbert Lee <gilbert.lee@amd.com>
Co-authored-by: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>

Co-authored-by: Stanley Tsang <stanley.tsang@amd.com>
Co-authored-by: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
---
 CHANGELOG.md                           |   9 +
 CMakeLists.txt                         |   6 +
 src/bootstrap.cc                       |  41 +-
 src/clique/AllReduceCliqueKernel.h     |  75 ++++
 src/clique/CliqueCommon.h              |  93 +++++
 src/clique/CliqueManager.cc            | 517 +++++++++++++++++++++++++
 src/clique/CliqueManager.h             | 128 ++++++
 src/clique/CliqueShmNames.h            |  37 ++
 src/clique/HandleCache.cc              |  31 ++
 src/clique/HandleCache.h               | 142 +++++++
 src/clique/HandleShm.cc                |  67 ++++
 src/clique/HandleShm.h                 |  53 +++
 src/clique/Hash.cc                     |  34 ++
 src/clique/Hash.h                      |  28 ++
 src/clique/MsgQueue.cc                 |  72 ++++
 src/clique/MsgQueue.h                  |  42 ++
 src/clique/SharedMemHelper.h           |  43 ++
 src/clique/ShmObject.cc                |  45 +++
 src/clique/ShmObject.h                 | 210 ++++++++++
 src/collectives/device/all_reduce.cu   |   7 +-
 src/collectives/device/all_reduce.h    |  80 +---
 src/collectives/device/common.h        |  69 +++-
 src/collectives/device/common_kernel.h |   4 +
 src/enqueue.cc                         |  51 ++-
 src/include/bootstrap.h                |   2 +-
 src/include/comm.h                     |  11 +-
 src/include/devcomm.h                  |  18 +-
 src/init.cc                            |  60 ++-
 test/CorrectnessTest.hpp               |  19 +-
 test/test_AllReduce.cpp                |   2 +-
 test/test_CombinedCalls.cpp            |   6 +-
 test/test_GroupCalls.cpp               |   2 +-
 tools/HelloRccl/HelloRccl.cpp          | 257 ++++++++++++
 tools/HelloRccl/HelloRccl.hpp          |  49 +++
 tools/HelloRccl/Makefile               |  21 +
 tools/HelloRccl/runTest.sh             |  22 ++
 36 files changed, 2246 insertions(+), 107 deletions(-)
 create mode 100644 src/clique/AllReduceCliqueKernel.h
 create mode 100644 src/clique/CliqueCommon.h
 create mode 100644 src/clique/CliqueManager.cc
 create mode 100644 src/clique/CliqueManager.h
 create mode 100644 src/clique/CliqueShmNames.h
 create mode 100644 src/clique/HandleCache.cc
 create mode 100644 src/clique/HandleCache.h
 create mode 100644 src/clique/HandleShm.cc
 create mode 100644 src/clique/HandleShm.h
 create mode 100644 src/clique/Hash.cc
 create mode 100644 src/clique/Hash.h
 create mode 100644 src/clique/MsgQueue.cc
 create mode 100644 src/clique/MsgQueue.h
 create mode 100644 src/clique/SharedMemHelper.h
 create mode 100644 src/clique/ShmObject.cc
 create mode 100644 src/clique/ShmObject.h
 create mode 100644 tools/HelloRccl/HelloRccl.cpp
 create mode 100644 tools/HelloRccl/HelloRccl.hpp
 create mode 100644 tools/HelloRccl/Makefile
 create mode 100755 tools/HelloRccl/runTest.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60276a7b01..2388e7c640 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,8 +3,17 @@
 Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
 
 ## [Unreleased]
+### Added
+- Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
+- Clique-based kernels may offer better performance for smaller input sizes
+- Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT)
 ### Optimizations
 - Performance improvements for Rome-based systems
+### Known issues
+- Clique-based kernels are currently experimental and have not been fully tested on all topologies.  By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE)
+- Clique-based kernels may hang if there are differences between environment variables set across ranks.
+- Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc.
+
 
 ## [RCCL-2.7.8 for ROCm 3.9.0]
 ### Added
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c37283c129..bf201ac014 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -124,6 +124,12 @@ set(CC_SOURCES
     src/collectives/all_to_all_api.cc
     src/collectives/all_to_allv_api.cc
     src/channel.cc
+    src/clique/CliqueManager.cc     # RCCL
+    src/clique/HandleCache.cc       # RCCL
+    src/clique/HandleShm.cc         # RCCL
+    src/clique/Hash.cc              # RCCL
+    src/clique/MsgQueue.cc          # RCCL
+    src/clique/ShmObject.cc         # RCCL
     src/misc/argcheck.cc
     src/misc/nvmlwrap_stub.cc
     src/misc/utils.cc
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index a54ef1d003..9bf8d6840d 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,6 +13,11 @@
 #include "socket.h"
 #include <unistd.h>
 #include <sys/types.h>
+// [RCCL]
+#include "clique/CliqueManager.h"
+#include "clique/CliqueShmNames.h"
+#include "clique/Hash.h"
+// [/RCCL]
 
 /* Init functions */
 static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
@@ -96,10 +102,19 @@ static ncclResult_t setFilesLimit() {
   return ncclSuccess;
 }
 
-static void *bootstrapRoot(void* args) {
-  int listenFd = (uint64_t)args;
+static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to include hash argument)
+
+  // [RCCL] Unpack bootstrapRootStruct
+  struct bootstrapRootStruct rootStruct = *(struct bootstrapRootStruct*)bootstrapRootStruct;
+  int listenFd = rootStruct.listenFd;
+  unsigned long hash = rootStruct.hash;
+  int pid = getpid(); // sharing PID to other ranks for creating shared memory files for CliqueManager
+  free(bootstrapRootStruct);
+  // [/RCCL]
+
   ncclResult_t res = ncclSuccess;
   int nranks = 0, c = 0;
+
   struct extInfo info;
   union socketAddress *rankAddresses = NULL;
   union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
@@ -140,12 +155,20 @@ static void *bootstrapRoot(void* args) {
   } while (c < nranks);
   TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
 
+  { // [RCCL] Initialize message queues / shared memory files
+    NCCLCHECKGOTO(CliqueManager::BootstrapRootInit(pid, hash), res, out);
+  } // [/RCCL]
+
   // Send the connect handle for the next rank in the AllGather ring
   for (int r=0; r<nranks; ++r) {
     int next = (r+1) % nranks;
+
     int tmpSendFd;
     NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
     NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddresses+next, sizeof(union socketAddress)), res, out);
+    { // [RCCL] Send the root pid for shared file naming
+      NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, &pid, sizeof(int)), res, out);
+    } // [/RCCL]
     close(tmpSendFd);
   }
   TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
@@ -165,7 +188,14 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
   int listenFd;
   NCCLCHECK(createListenSocket(&listenFd, connectAddr));
   pthread_t thread;
-  pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd);
+
+  // [RCCL] Use the ncclUniqueId to get a hash for bootstrap
+  struct bootstrapRootStruct* rootStruct = new struct bootstrapRootStruct;
+  rootStruct->hash = djb2Hash(id->internal);
+  rootStruct->listenFd = listenFd;
+  pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct);
+  // [/RCCL]
+
   return ncclSuccess;
 }
 
@@ -319,7 +349,7 @@ ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
   return ncclSuccess;
 }
 
-ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
+ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState, int* rootPid) { // [RCCL] Adding rootPid
   struct extState* state;
   NCCLCHECK(ncclCalloc(&state, 1));
   state->rank = rank;
@@ -359,6 +389,9 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
   union socketAddress extAddressNext;
   NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd));
   NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &extAddressNext, sizeof(extAddressNext)));
+  { // [RCCL] Receive PID from root
+    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, rootPid, sizeof(int)));
+  } // [/RCCL]
   close(tmpRecvFd);
   close(extListenFdRoot);
 
diff --git a/src/clique/AllReduceCliqueKernel.h b/src/clique/AllReduceCliqueKernel.h
new file mode 100644
index 0000000000..291bc37f33
--- /dev/null
+++ b/src/clique/AllReduceCliqueKernel.h
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ALLREDUCECLIQUEKERNEL_H
+#define ALLREDUCECLIQUEKERNEL_H
+
+#include "CliqueCommon.h"
+#include "devcomm.h"
+#include "reduce_kernel.h"
+#include "common_kernel.h"
+
+template <class FUNC, typename T, int NUM_RANKS>
+__device__ void AllReduceCliqueSplitKernel(struct ncclWorkElem* args)
+{
+  // Clique-specific kernel arguments
+  cliqueDevicePtrs_t* cliquePtrs = args->clique.ptrs;      // Collection of all input/output pointers across ranks in clique
+  size_t const N                 = args->clique.count;     // Total number of elements to reduce
+  int    const nBlocks           = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
+  int    const blockId           = args->clique.bid;       // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
+  int    const rank              = args->comm->rank;       // Current rank
+
+  // Each threadblock works independently of others on a subsection of the input
+  // First split evently across ranks, while maintaining multiples of blocksize
+  size_t const perRankN       = RoundUp((N + NUM_RANKS - 1) / NUM_RANKS, blockDim.x);
+  size_t const perBlockN      = RoundUp((perRankN + nBlocks - 1) / nBlocks, blockDim.x);
+  size_t const currBlockStart = min((rank * nBlocks + blockId) * perBlockN, N);
+  size_t const currBlockStop  = min(currBlockStart + perBlockN, N);
+  size_t const blockN         = currBlockStop - currBlockStart;
+
+  if (blockN > 0)
+  {
+    // Prepare input / output subarrays
+    T const** inputs  = (T const**)cliquePtrs->inputs;
+    T**       outputs = (T      **)cliquePtrs->outputs;
+    T const* srcs[NUM_RANKS];
+    T*       dsts[NUM_RANKS];
+
+    #pragma unroll
+    for (int r = 0; r < NUM_RANKS; r++)
+    {
+      srcs[r] = inputs[r]  + currBlockStart;
+      dsts[r] = outputs[r] + currBlockStart;
+    }
+
+    // Perform the reduction
+    #define ALL_REDUCE_CLIQUE_UNROLL 1
+    ReduceOrCopyMulti<ALL_REDUCE_CLIQUE_UNROLL, FUNC, T, NUM_RANKS, NUM_RANKS, NUM_RANKS, NUM_RANKS>(
+      threadIdx.x, blockDim.x, NUM_RANKS, srcs, NUM_RANKS, dsts, blockN);
+  }
+
+  // Even if there was nothing for this GPU to do, it must participate in a barrier
+  // because other GPUs may be modifying this GPUs output buffer still
+  if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
+}
+
+#endif
diff --git a/src/clique/CliqueCommon.h b/src/clique/CliqueCommon.h
new file mode 100644
index 0000000000..13fd5a92ed
--- /dev/null
+++ b/src/clique/CliqueCommon.h
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef CLIQUE_COMMON_H
+#define CLIQUE_COMMON_H
+
+#include "nccl.h"
+#include <cstdint>
+
+#define MIN_CLIQUE_SIZE 2
+#define MAX_CLIQUE_SIZE 8
+
+typedef struct
+{
+  int* globalCount;   // Shared across GPUs
+  int* globalSense;   // Shared across GPUs
+  int* localSense;    // Local to this GPU
+} gpuBarrier_t;
+
+typedef struct
+{
+  // Input/output pointers from participating ranks
+  void const* inputs[MAX_CLIQUE_SIZE];
+  void*       outputs[MAX_CLIQUE_SIZE];
+
+  // Barrier variable
+  gpuBarrier_t barrier;
+} cliqueDevicePtrs_t;
+
+// Helper macro to launch an appropriate kernel by converting rank to a template argument
+#define LAUNCH_CLIQUE_KERNEL(kernelname, FUNC, T, args)  \
+  {                                                      \
+    switch (args->comm->nRanks){                         \
+    case 2: kernelname<FUNC, T, 2>(args); break;         \
+    case 3: kernelname<FUNC, T, 3>(args); break;         \
+    case 4: kernelname<FUNC, T, 4>(args); break;         \
+    case 5: kernelname<FUNC, T, 5>(args); break;         \
+    case 6: kernelname<FUNC, T, 6>(args); break;         \
+    case 7: kernelname<FUNC, T, 7>(args); break;         \
+    case 8: kernelname<FUNC, T, 8>(args); break;         \
+    }                                                    \
+  }
+
+// Multi-GPU (on same node) barrier.  One thread per grid per GPU updates barrier / waits
+template <int NUM_RANKS>
+__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
+{
+  if (threadIdx.x == 0)
+  {
+    // Sense inversion barrier
+    *barrier.localSense = 1 - *barrier.localSense;
+    int localSense = *barrier.localSense;
+
+    int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
+    if (val == NUM_RANKS)
+    {
+      // Last arrival resets barrier
+      __atomic_store_n(barrier.globalCount, 0, __ATOMIC_SEQ_CST);
+      __atomic_store_n(barrier.globalSense, localSense, __ATOMIC_SEQ_CST);
+    }
+    else
+    {
+      // Wait for all ranks to reach barrier
+      while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
+    }
+  }
+}
+
+__forceinline__ __host__ __device__ size_t RoundUp(size_t X, size_t Y)
+{
+  return (X+Y-1)/Y * Y;
+}
+
+#endif
diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc
new file mode 100644
index 0000000000..089e5d030a
--- /dev/null
+++ b/src/clique/CliqueManager.cc
@@ -0,0 +1,517 @@
+/*
+Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "CliqueManager.h"
+#include "CliqueShmNames.h"
+#include "MsgQueue.h"
+
+#include "nccl.h"
+#include "core.h"
+
+#include "Hash.h"
+
+#include "AllReduceCliqueKernel.h"
+
+#include <hip/hip_runtime.h>
+#include <hsa/hsa_ext_amd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <thread>
+
+cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS]  = {};
+int*               CliqueManager::m_staticGpuBarrierMem             = NULL;
+
+// Define some environment variables that affect clique-based kernels
+RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0);                                  // Opt-in environment variable for clique-based kernels
+RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 2097152);  // Max number of bytes to use clique-based kernels for all reduce
+RCCL_PARAM(AllReduceNumChannels,     "CLIQUE_ALLREDUCE_NCHANNELS", 4);         // Number of channels to use for all-reduce
+
+CliqueManager::CliqueManager(int          const  rank,
+                             int          const  numRanks,
+                             cliqueMode_t const  cliqueMode) :
+  m_rank(rank),
+  m_numRanks(numRanks),
+  m_cliqueMode(cliqueMode),
+  m_init(false),
+  m_pinnedCliquePtrs(NULL),
+  m_fineGrainBarrierMem(NULL)
+{
+}
+
+CliqueManager::~CliqueManager()
+{
+  if (m_init)
+  {
+    CleanUp();
+  }
+}
+
+void CliqueManager::CleanUp()
+{
+  if (m_cliqueMode == CLIQUE_DISABLED) return;
+
+  // Free variables that are shared between SINGLE_PROCESS / SINGLE_NODE
+  if (m_pinnedCliquePtrs) hipHostFree(m_pinnedCliquePtrs);
+  if (m_gpuBarrierLocalSense) hipFree(m_gpuBarrierLocalSense);
+
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Release caches
+    if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
+    if (m_ipcHandleSendCache) delete m_ipcHandleRecvCache;
+
+    // Close shared memory
+    m_shmHandles.Close();
+    m_sharedCpuMemory.Close();
+    m_sharedIpcHandle.Close();
+
+    if (m_fineGrainBarrierMem)
+    {
+      if (m_rank == 0)
+        hipFree(m_fineGrainBarrierMem);
+      else
+        hipIpcCloseMemHandle(m_fineGrainBarrierMem);
+    }
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    if (m_rank == 0 && m_staticGpuBarrierMem)
+      hipFree(m_staticGpuBarrierMem);
+  }
+  m_init = false;
+}
+
+ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
+{
+  ncclResult_t res;
+
+  if (m_init) return ncclSuccess;
+  m_init = true;
+
+  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+
+  // Check parameters
+  if (m_rank < 0 || m_rank >= m_numRanks)
+  {
+    WARN("Invalid rank specified.  Expected 0 <= %d < %d for CliqueManager", m_rank, m_numRanks);
+    return ncclInvalidUsage;
+  }
+  if (commId == NULL)
+  {
+    WARN("CommId should not be empty");
+    return ncclInvalidUsage;
+  }
+
+  // For now, opt-into clique based kernels via RCCL_ENABLE_CLIQUE env var
+  if (!rcclParamEnableClique())
+  {
+    INFO(NCCL_INIT, "Disabling clique-based kernels (did not find env var RCCL_ENABLE_CLIQUE)");
+    m_cliqueMode = CLIQUE_DISABLED;
+    return ncclSuccess;
+  }
+
+  // Allocate pinned CPU memory for holding clique pointers, which kernels will have access to
+  if (hipHostMalloc(&m_pinnedCliquePtrs, sizeof(cliqueDevicePtrs_t) * NCCL_MAX_OPS) != hipSuccess)
+  {
+    WARN("Unable to allocated pinned host memory for clique pointers.  Disabling clique-based kernels");
+    m_cliqueMode = CLIQUE_DISABLED;
+    m_init = true;
+    return ncclSuccess;
+  }
+
+  unsigned long hash = djb2Hash(commId->internal);
+  std::string shmSuffix = std::to_string(hash) + "_" + std::to_string(suffix);
+
+  // Allocate sense barrier variable on local GPU
+  NCCLCHECKGOTO(ncclCudaCalloc(&m_gpuBarrierLocalSense, NCCL_MAX_OPS * sizeof(int)), res, dropback);
+
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Initialize shared memory file for IPC handles (based on commId hash)
+    m_shmHandles = NcclIpcHandleShm(m_rank, m_numRanks, hash, NUM_HANDLES_PER_RANK, NCCL_MAX_OPS, shmSuffix);
+    NCCLCHECKGOTO(m_shmHandles.Open(), res, dropback);
+
+    // Initialize IPC caches
+    m_ipcHandleSendCache = new NcclIpcHandleSendCache(m_numRanks * NUM_HANDLES_PER_RANK * NCCL_MAX_OPS);
+    m_ipcHandleRecvCache = new NcclIpcHandleRecvCache(m_numRanks * NUM_HANDLES_PER_RANK * NCCL_MAX_OPS,
+                                                      100,
+                                                      hipIpcMemHandleHash,
+                                                      hipIpcMemHandleEqual);
+
+    // Initialize shared object for GPU barrier IPC handle
+    m_sharedIpcHandle = ShmObject<hipIpcMemHandle_t>(std::max(4096LU, sizeof(hipIpcMemHandle_t)),
+                                                     CliqueShmNames["Barriers"] + shmSuffix,
+                                                     m_rank,
+                                                     m_numRanks,
+                                                     hash);
+    NCCLCHECKGOTO(m_sharedIpcHandle.Open(), res, dropback);
+
+    if (m_rank == 0)
+    {
+      hipIpcMemHandle_t handle;
+      // Allocate fine-grained device memory on rank 0 and get IPC handle for it
+      // Re-usable barrier consists of (globalCount / globalSense) pair of integers
+      NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
+      if (hipIpcGetMemHandle(&handle, m_fineGrainBarrierMem) != hipSuccess)
+      {
+        WARN("Unable to get IPC handle for barrier memory");
+        goto dropback;
+      }
+      // Write IPC handle to shared memory for other ranks to receive
+      *m_sharedIpcHandle.Get() = handle;
+
+      // Set up global count/sense for first rank
+      m_gpuBarrierGlobalCount = &m_fineGrainBarrierMem[0];
+      m_gpuBarrierGlobalSense = &m_fineGrainBarrierMem[NCCL_MAX_OPS];
+    }
+
+    // Initialize shared CPU memory to be used for barrier variables
+    m_sharedCpuMemory = ShmObject<int32_t>(2 * sizeof(int32_t),
+                                           CliqueShmNames["SharedCounters"] + shmSuffix,
+                                           m_rank,
+                                           m_numRanks,
+                                           hash);
+    NCCLCHECKGOTO(m_sharedCpuMemory.Open(), res, dropback);
+
+    // Split up the shared CPU memory for barrier counters / global sense
+    m_cpuBarrierGlobalCount = &m_sharedCpuMemory.Get()[0];
+    m_cpuBarrierGlobalSense = &m_sharedCpuMemory.Get()[1];
+
+    // Initialize CPU barriers
+    if (m_rank == 0)
+    {
+      *m_cpuBarrierGlobalCount = 0;
+      *m_cpuBarrierGlobalSense = 0;
+    }
+    m_cpuBarrierLocalSense = 0;
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    // First rank prepares fine-grained memory shared across ranks used for the two barrier variables
+    if (m_rank == 0)
+    {
+      NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
+      // Prepare all barriers
+      for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
+      {
+        m_staticCliquePtrs[opIndex].barrier.globalCount = &m_staticGpuBarrierMem[opIndex];
+        m_staticCliquePtrs[opIndex].barrier.globalSense = &m_staticGpuBarrierMem[opIndex + NCCL_MAX_OPS];;
+      }
+    }
+  }
+
+
+  m_init = true;
+  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d)", m_cliqueMode);
+  return ncclSuccess;
+
+dropback:
+  // NOTE: This currently assumes that all ranks will fail the same way
+  //       Additional support is required to handle cases when some processes succeed while others fail
+  WARN("Unable to initialize shared memory. Disabling clique-based kernels");
+  CleanUp();
+  m_cliqueMode = CLIQUE_DISABLED;
+  return ncclSuccess;
+}
+
+bool CliqueManager::IsSupported(ncclFunc_t const coll,
+                                size_t const count,
+                                ncclDataType_t const datatype,
+                                ncclRedOp_t const op) const
+{
+  if (m_cliqueMode == CLIQUE_DISABLED) return false;
+
+  // Filter based on total input size for each collective type
+  size_t totalBytes = count * ncclTypeSize(datatype);
+  if (coll == ncclFuncAllReduce && (totalBytes <= rcclParamAllReduceCliqueByteLimit())) return true;
+
+  return false;
+}
+
+ncclResult_t CliqueManager::DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr)
+{
+  // Do nothing if disabled
+  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+
+  if (!m_init)
+  {
+    WARN("CliqueManager must be initialized before use");
+    return ncclInvalidUsage;
+  }
+
+  int const opIndex = opCount % NCCL_MAX_OPS;
+
+  // Add opIndex to queue of in-progress collectives
+  m_inProgress.push(opIndex);
+
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Get fine-grained device memory if not already done
+    if (m_fineGrainBarrierMem == NULL)
+    {
+      hipIpcMemHandle_t handle = *m_sharedIpcHandle.Get();
+      CUDACHECK(hipIpcOpenMemHandle((void**)&m_fineGrainBarrierMem, handle, hipIpcMemLazyEnablePeerAccess));
+
+      // Prepare global count/sense barrier variables used the ipc-shared gpu device memory
+      m_gpuBarrierGlobalCount = &m_fineGrainBarrierMem[0];
+      m_gpuBarrierGlobalSense = &m_fineGrainBarrierMem[NCCL_MAX_OPS];
+    }
+
+    std::vector<std::pair<hipIpcMemHandle_t,size_t>> handles(NUM_HANDLES_PER_RANK);
+
+    // Get IPC handles for input/output pointers from cache
+    NCCLCHECK(CheckCacheForPtr(const_cast<void*>(inputPtr), m_ipcHandleSendCache, m_rank, &handles[0]));
+    NCCLCHECK(CheckCacheForPtr(outputPtr                  , m_ipcHandleSendCache, m_rank, &handles[1]));
+
+    // Prepare barrier pointers (done after the IpcOpenMemory)
+    m_pinnedCliquePtrs[opIndex].barrier.globalCount = &m_gpuBarrierGlobalCount[opIndex];
+    m_pinnedCliquePtrs[opIndex].barrier.globalSense = &m_gpuBarrierGlobalSense[opIndex];
+    m_pinnedCliquePtrs[opIndex].barrier.localSense  = &m_gpuBarrierLocalSense[opIndex];
+
+    // Write IPC handles to shared memory for given rank / opCount
+    NCCLCHECK(m_shmHandles.WriteHandles(opIndex, handles));
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    // Store this rank's input/output pointers into static member
+    m_staticCliquePtrs[opIndex].inputs[m_rank]  = inputPtr;
+    m_staticCliquePtrs[opIndex].outputs[m_rank] = outputPtr;
+  }
+
+  return ncclSuccess;
+}
+
+ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
+                                                size_t const count,
+                                                ncclDataType_t const datatype,
+                                                ncclRedOp_t const op,
+                                                int const totalNumChannels,
+                                                uint8_t* numChannelstoUse)
+{
+  size_t const totalBytes = count * ncclTypeSize(datatype);
+  *numChannelstoUse = 1;
+
+  if (coll == ncclFuncAllReduce) {
+    *numChannelstoUse = std::min((int)rcclParamAllReduceNumChannels(), totalNumChannels);
+  }
+
+  return ncclSuccess;
+}
+
+
+
+ncclResult_t CliqueManager::SetCliqueArgs(ncclWorkElem* args)
+{
+  // Do nothing if disabled
+  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+  if (!m_init)
+  {
+    WARN("CliqueManager must be initialized before use");
+    return ncclInvalidUsage;
+  }
+
+  // Prepare clique argments (NOTE: clique pointers are not ready yet)
+  int opIndex = args->opCount % NCCL_MAX_OPS;
+  args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
+
+  // Determine number of channels to use for this collective
+  args->clique.nChannels = rcclParamAllReduceNumChannels();
+
+  return ncclSuccess;
+}
+
+ncclResult_t CliqueManager::WaitForPointers()
+{
+  // Do nothing if disabled
+  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+
+  if (!m_init)
+  {
+    WARN("CliqueManager must be initialized before use");
+    return ncclInvalidUsage;
+  }
+
+  // Do nothing if there are no outstanding clique-kernels
+  if (m_inProgress.empty()) return ncclSuccess;
+
+  // Copy clique device pointers to pinned device memory
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Wait for all ranks to arrive
+    WaitForBarrier();
+
+    int numHandles = m_numRanks * NUM_HANDLES_PER_RANK;
+    std::vector<std::pair<hipIpcMemHandle_t,size_t>> handles(numHandles);
+
+    while (!m_inProgress.empty())
+    {
+      int const opIndex = m_inProgress.front();
+      m_inProgress.pop();
+
+      // Collect the ready handles from shared memory and convert them to device pointers
+      NCCLCHECK(m_shmHandles.ReadHandles(opIndex, handles));
+      for (int i = 0; i < m_numRanks; i++)
+      {
+        void *input;
+        NCCLCHECK(CheckCacheForHandle(handles[i * NUM_HANDLES_PER_RANK],
+                                      m_ipcHandleRecvCache, &input));
+        m_pinnedCliquePtrs[opIndex].inputs[i] = const_cast<const void *>(input);
+
+        NCCLCHECK(CheckCacheForHandle(handles[(i * NUM_HANDLES_PER_RANK) + 1],
+                                      m_ipcHandleRecvCache, &m_pinnedCliquePtrs[opIndex].outputs[i]));
+      }
+    }
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    while (!m_inProgress.empty())
+    {
+      int const opIndex = m_inProgress.front();
+      m_inProgress.pop();
+
+      // Copy from static memory to pinned host memory and set local sense
+      memcpy(&m_pinnedCliquePtrs[opIndex], &m_staticCliquePtrs[opIndex], sizeof(cliqueDevicePtrs_t));
+      m_pinnedCliquePtrs[opIndex].barrier.localSense = &m_gpuBarrierLocalSense[opIndex];
+    }
+  }
+  return ncclSuccess;
+}
+
+std::string HandleToString(hipIpcMemHandle_t handle)
+{
+  char mapping[17] = "0123456789ABCDEF";
+  std::string result;
+  for (int i = 0; i < 4; i++)
+  {
+    unsigned char val = (unsigned char)handle.reserved[i];
+    result += mapping[val / 16];
+    result += mapping[val % 16];
+  }
+  return result;
+}
+
+
+ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
+                                             NcclIpcHandleSendCache* cache,
+                                             int rank,
+                                             std::pair<hipIpcMemHandle_t, size_t>* handlePair)
+{
+  // Get the base address for this device allocation
+  hsa_status_t status;
+  hsa_amd_pointer_info_t info;
+  info.size = sizeof(hsa_amd_pointer_info_t);
+  status = hsa_amd_pointer_info(devPtr, &info, NULL, NULL, NULL);
+  if (status != HSA_STATUS_SUCCESS) {
+    WARN("Uanble to get pointer information for %p", devPtr);
+    return ncclInvalidArgument;
+  }
+
+  // Compute the offset between the device addres and the base address
+  uint64_t baseAddr = (uint64_t)info.agentBaseAddress;
+  uint64_t realAddr = (uint64_t)devPtr;
+  handlePair->second = realAddr - baseAddr;
+
+  // IPC handles are only supported for base address pointers
+  NcclIpcHandleSendCache::iterator it = cache->find(baseAddr);
+
+   if (it == cache->end())
+   {
+     CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
+     cache->insert(baseAddr, handlePair->first);
+   }
+   else
+   {
+     handlePair->first = (it->second).first;
+   }
+   return ncclSuccess;
+}
+
+ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
+                                                NcclIpcHandleRecvCache* cache,
+                                                void** ptr)
+{
+  NcclIpcHandleRecvCache::iterator it = cache->find(handlePair.first);
+
+  // Get base address pointer from cache if it exists
+  void* baseAddr;
+  if (it == cache->end())
+  {
+    CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
+    cache->insert(handlePair.first, baseAddr);
+  }
+  else
+  {
+    baseAddr = (it->second).first;
+  }
+
+  // Modify base address pointer with offset
+  uint64_t realAddr = (uint64_t)baseAddr + handlePair.second;
+  *ptr = (void*)realAddr;
+  return ncclSuccess;
+}
+
+void CliqueManager::WaitForBarrier()
+{
+  // Sense inversion barrier
+  m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
+
+  if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
+  {
+    // Reset the barrier
+    STORE(m_cpuBarrierGlobalCount, 0);
+    STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
+  } else {
+    while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
+  }
+}
+
+ncclResult_t CliqueManager::BootstrapRootInit(int pid, unsigned long hash)
+{
+  for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
+  {
+    int msgid, fd;
+    std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid);
+    SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd);
+    NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid));
+    SYSCHECK(close(fd), "close");
+  }
+
+  std::string shmDir = "/dev/shm/";
+
+  for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
+  {
+    struct stat fileStatus;
+    std::string shmFileName = it->second + std::to_string(hash) + "_" + std::to_string(pid);
+    std::string shmFullPath = shmDir + shmFileName;
+
+    // Check if shm file already exists; if so, unlink it
+    if (stat(shmFullPath.c_str(), &fileStatus) == 0)
+    {
+      NCCLCHECK(shmUnlink(shmFileName.c_str()));
+    }
+  }
+  return ncclSuccess;
+}
diff --git a/src/clique/CliqueManager.h b/src/clique/CliqueManager.h
new file mode 100644
index 0000000000..b522d9e036
--- /dev/null
+++ b/src/clique/CliqueManager.h
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RCCL_CLIQUE_MANAGER_HPP_
+#define RCCL_CLIQUE_MANAGER_HPP_
+
+#include <semaphore.h>
+#include <mutex>
+#include <queue>
+
+#include "nccl.h"
+#include "devcomm.h"
+#include "CliqueCommon.h"
+#include "HandleCache.h"
+#include "HandleShm.h"
+
+#define NUM_HANDLES_PER_RANK 2
+
+class CliqueManager
+{
+public:
+  typedef enum
+  {
+    CLIQUE_DISABLED       = 0,
+    CLIQUE_SINGLE_PROCESS = 1,
+    CLIQUE_SINGLE_NODE    = 2
+  } cliqueMode_t;
+
+  CliqueManager(int const rank, int const numRanks, cliqueMode_t const cliqueMode);
+
+  ~CliqueManager();
+
+  void CleanUp();
+
+  ncclResult_t Init(ncclUniqueId const* commId, int suffix);
+
+  // Returns true if the collective is supported via a clique-based kernel
+  bool IsSupported(ncclFunc_t const coll,
+                   size_t const count,
+                   ncclDataType_t const datatype,
+                   ncclRedOp_t const op) const;
+
+  // Provide the pointers to be exchanged across the clique for the given rank / opCount
+  ncclResult_t DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr);
+
+  // Determine the number of channels / CUs to use for this call
+  ncclResult_t GetNumChannelsToUse(ncclFunc_t const coll,
+                                   size_t const count,
+                                   ncclDataType_t const datatype,
+                                   ncclRedOp_t const op,
+                                   int const totalNumChannels,
+                                   uint8_t* numChannelstoUse);
+
+  // Set pointers for where clique-related arguments will be found
+  // This sets pointers to device-accessible memory where the arguments will eventually reside
+  ncclResult_t SetCliqueArgs(ncclWorkElem* args);
+
+  // Blocking call that only returns after all out-standing clique pointers are ready
+  ncclResult_t WaitForPointers();
+
+  // Prepares shared memory files upon initialization
+  static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
+
+protected:
+  ncclResult_t CheckCacheForPtr(void* devPtr,
+                                NcclIpcHandleSendCache* cache,
+                                int rank,
+                                std::pair<hipIpcMemHandle_t, size_t>* handlePair);
+
+  ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
+                                   NcclIpcHandleRecvCache* cache,
+                                   void** ptr);
+
+  // Race-condition helper functions
+  void WaitForBarrier();
+
+  int                          m_rank;                               // Associated rank
+  int                          m_numRanks;                           // Total number of ranks
+  cliqueMode_t                 m_cliqueMode;                         // Clique mode (off/single process/single node)
+  bool                         m_init;                               // Whether CliqueManager has been initialized
+  cliqueDevicePtrs_t*          m_pinnedCliquePtrs;                   // Pinned-host-memory (device accessible) containing device pointers
+  int*                         m_gpuBarrierGlobalCount;              // Part of GPU barrier (count variable shared across ranks)
+  int*                         m_gpuBarrierGlobalSense;              // Part of GPU barrier (reset variable shared across ranks)
+  int*                         m_gpuBarrierLocalSense;               // Part of GPU barrier (reset variable local to this rank)
+  std::queue<int>              m_inProgress;                         // Queue of clique-based collectives waiting for pointers
+
+  // IPC-related (CLIQUE_SINGLE_NODE)
+  NcclIpcHandleShm             m_shmHandles;                         // Used to exchange IPC handles between ranks
+  NcclIpcHandleSendCache*      m_ipcHandleSendCache;                 // Caches pointers to IPC handles (to send to other processes)
+  NcclIpcHandleRecvCache*      m_ipcHandleRecvCache;                 // Caches IPC handles to pointers (received from other processes)
+  ShmObject<int32_t>           m_sharedCpuMemory;                    // Used to pass shared memory used for CPU barrier
+  ShmObject<hipIpcMemHandle_t> m_sharedIpcHandle;                    // Used to pass fine-grained device memory buffer IPC handle
+  int*                         m_fineGrainBarrierMem;                // Fine-grained GPU memory barrier (allocated only on 1st rank, shared on others)
+  int*                         m_cpuBarrierGlobalCount;              // Part of CPU barrier (count variable shared across ranks)
+  int*                         m_cpuBarrierGlobalSense;              // Part of CPU barrier (reset variable shared across ranks)
+  int                          m_cpuBarrierLocalSense;               // Part of CPU barrier (reset variable local to this rank)
+
+  // Single-process (CLIQUE_SINGLE_PROCESS)
+  static cliqueDevicePtrs_t    m_staticCliquePtrs[NCCL_MAX_OPS];     // Use shared static memory to exchange pointer info
+  static int*                  m_staticGpuBarrierMem;                // Static storage backing for fine-grained gpu barrier
+};
+
+// For use in bootstrapping code
+struct bootstrapRootStruct {
+  int listenFd;
+  unsigned long hash;
+};
+
+#endif
diff --git a/src/clique/CliqueShmNames.h b/src/clique/CliqueShmNames.h
new file mode 100644
index 0000000000..577af8be8d
--- /dev/null
+++ b/src/clique/CliqueShmNames.h
@@ -0,0 +1,37 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_CLIQUE_SHM_NAMES_H_
+#define NCCL_CLIQUE_SHM_NAMES_H_
+
+#include <string>
+#include <map>
+
+static std::map<std::string, std::string> CliqueShmNames =
+{
+    {"SharedCounters", "RcclCounters"  },
+    {"Mutexes"       , "RcclMutexes"   },
+    {"IpcHandles"    , "RcclIpcHandles"},
+    {"Barriers"      , "RcclBarriers"  }
+};
+
+#endif
diff --git a/src/clique/HandleCache.cc b/src/clique/HandleCache.cc
new file mode 100644
index 0000000000..8fe4b6ab4b
--- /dev/null
+++ b/src/clique/HandleCache.cc
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "HandleCache.h"
+
+#include "Hash.h"
+
+// djb2 hash function for hashing char array in hipIpcMemHandle_t
+unsigned long hipIpcMemHandleHash(const hipIpcMemHandle_t& handle)
+{
+    return djb2Hash(handle.reserved);
+}
diff --git a/src/clique/HandleCache.h b/src/clique/HandleCache.h
new file mode 100644
index 0000000000..dc479e00e8
--- /dev/null
+++ b/src/clique/HandleCache.h
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_HANDLE_CACHE_H_
+#define NCCL_HANDLE_CACHE_H_
+
+#include <list>
+#include <unordered_map>
+#include <functional>
+
+#include "core.h"
+
+//#include "llvm/ADT/DenseMap.h"
+
+template <
+    class Key,
+    class Value,
+    class Hash,
+    class KeyEqual,
+    class Allocator
+>
+class NcclIpcHandleCache
+{
+public:
+    typedef std::pair<Value, typename std::list<Key>::iterator> NcclIpcHandleCacheValueType;
+    typedef std::unordered_map<Key, NcclIpcHandleCacheValueType, Hash, KeyEqual, Allocator> LRUCache;
+    using iterator = typename LRUCache::iterator;
+    NcclIpcHandleCache(size_t size,
+                       size_t bucket_count = 100,
+                       const Hash& hash = Hash(),
+                       const KeyEqual& eql = KeyEqual(),
+                       const Allocator& alloc = Allocator() ) : m_cache(bucket_count, hash, eql, alloc)
+    {
+        m_capacity = size;
+    }
+
+    ~NcclIpcHandleCache()
+    {
+        m_lruHistory.clear();
+        m_cache.clear();
+    }
+
+    iterator begin()
+    {
+        return m_cache.begin();
+    }
+
+    iterator end()
+    {
+        return m_cache.end();
+    }
+
+    iterator find(const Key& key)
+    {
+        iterator it = m_cache.find(key);
+        if (it != m_cache.end())
+        {
+            updateHistory(it);
+        }
+
+        return it;
+    }
+
+    std::pair<iterator, bool> insert(const Key& key, const Value& value)
+    {
+        if (m_cache.size() == m_capacity)
+        {
+            // remove entry
+            pop();
+        }
+
+        typename LRUCache::iterator it = m_cache.find(key);
+        bool inserted;
+        if (it == m_cache.end())
+        {
+            typename std::list<Key>::iterator it = m_lruHistory.insert(m_lruHistory.end(), key);
+            m_cache.insert(std::make_pair(key, std::make_pair(value, it)));
+            inserted = true;
+        }
+        else
+        {
+            inserted = false;
+        }
+
+        return std::pair<iterator, bool>(it, inserted);
+    }
+
+private:
+    void pop()
+    {
+        typename LRUCache::iterator it = m_cache.find(m_lruHistory.front());
+        m_cache.erase(it);
+        m_lruHistory.pop_front();
+    }
+
+    void updateHistory(const iterator& it)
+    {
+        if (m_lruHistory.size() > 0)
+        {
+            m_lruHistory.splice(m_lruHistory.end(), m_lruHistory, (it->second).second);
+        }
+    }
+    size_t m_capacity;
+    std::list<Key> m_lruHistory;
+    LRUCache m_cache;
+};
+
+// djb2 hash function for hashing char array in hipIpcMemHandle_t
+unsigned long hipIpcMemHandleHash(const hipIpcMemHandle_t& handle);
+
+// equality function required for unordered_map
+auto hipIpcMemHandleEqual = [](const hipIpcMemHandle_t& l, const hipIpcMemHandle_t& r)
+{
+    return memcmp(l.reserved, r.reserved, sizeof(l.reserved)) == 0;
+};
+
+//typedef llvm::DenseMap<uint64_t, hipIpcMemHandle_t> SendCache;
+//typedef llvm::DenseMap<hipIpcMemHandle_t, void*, decltype(&HandleHash), decltype(HandleEqual)> RecvCache;
+
+typedef NcclIpcHandleCache<uint64_t, hipIpcMemHandle_t, std::hash<uint64_t>, std::equal_to<uint64_t>, std::allocator< std::pair<const uint64_t, std::pair<hipIpcMemHandle_t, std::list<uint64_t>::iterator>>>> NcclIpcHandleSendCache;
+typedef NcclIpcHandleCache<hipIpcMemHandle_t, void*, decltype(&hipIpcMemHandleHash), decltype(hipIpcMemHandleEqual), std::allocator< std::pair<const hipIpcMemHandle_t, std::pair<void*, std::list<hipIpcMemHandle_t>::iterator>>>> NcclIpcHandleRecvCache;
+
+#endif
diff --git a/src/clique/HandleShm.cc b/src/clique/HandleShm.cc
new file mode 100644
index 0000000000..937390cf20
--- /dev/null
+++ b/src/clique/HandleShm.cc
@@ -0,0 +1,67 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip/hip_runtime.h>
+
+#include "HandleShm.h"
+#include "CliqueShmNames.h"
+#include "core.h"
+#include "Hash.h"
+#include "shm.h"
+
+NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix) :
+  ShmObject<std::pair<hipIpcMemHandle_t,size_t>>(numRanks * numHandlesPerRank * capacity * sizeof(std::pair<hipIpcMemHandle_t,size_t>),
+                                                 CliqueShmNames["IpcHandles"] + suffix,
+                                                 rank,
+                                                 numRanks,
+                                                 projid),
+  m_numHandlesPerRank(numHandlesPerRank),
+  m_numHandlesPerOpCount(numRanks * numHandlesPerRank)
+{
+}
+
+NcclIpcHandleShm::NcclIpcHandleShm()
+{
+}
+
+NcclIpcHandleShm::~NcclIpcHandleShm()
+{
+}
+
+ncclResult_t NcclIpcHandleShm::Open()
+{
+  return ShmObject::Open();
+}
+
+ncclResult_t NcclIpcHandleShm::WriteHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>> const& sendHandles)
+{
+  size_t idx = (opCount * m_numHandlesPerOpCount) + (m_rank *  m_numHandlesPerRank);
+  memcpy(m_shmPtr + idx, sendHandles.data(), sizeof(std::pair<hipIpcMemHandle_t,size_t>) * m_numHandlesPerRank);
+  return ncclSuccess;
+}
+
+ncclResult_t NcclIpcHandleShm::ReadHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>>& recvHandles)
+{
+  size_t idx = opCount * m_numHandlesPerOpCount;
+  memcpy(recvHandles.data(), m_shmPtr + idx, m_numHandlesPerOpCount * sizeof(std::pair<hipIpcMemHandle_t,ssize_t>));
+  return ncclSuccess;
+}
diff --git a/src/clique/HandleShm.h b/src/clique/HandleShm.h
new file mode 100644
index 0000000000..c681de0eb4
--- /dev/null
+++ b/src/clique/HandleShm.h
@@ -0,0 +1,53 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_IPC_HANDLE_SHM_H_
+#define NCCL_IPC_HANDLE_SHM_H_
+
+#include <hip/hip_runtime.h>
+#include <vector>
+#include <string>
+
+#include "nccl.h"
+#include "ShmObject.h"
+
+class NcclIpcHandleShm : public ShmObject<std::pair<hipIpcMemHandle_t,size_t>>
+{
+public:
+    NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix);
+
+    NcclIpcHandleShm();
+
+    ~NcclIpcHandleShm();
+
+    ncclResult_t Open();
+
+    ncclResult_t WriteHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>> const& sendHandles);
+
+    ncclResult_t ReadHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>>& recvHandles);
+
+private:
+    int m_numHandlesPerRank;
+    int m_numHandlesPerOpCount;
+};
+
+#endif
diff --git a/src/clique/Hash.cc b/src/clique/Hash.cc
new file mode 100644
index 0000000000..249c66b329
--- /dev/null
+++ b/src/clique/Hash.cc
@@ -0,0 +1,34 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "Hash.h"
+
+unsigned long djb2Hash(const char* data)
+{
+    unsigned long hash = 5381;
+    int c;
+
+    while ((c = *(data)++))
+        hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
+
+    return hash;
+}
diff --git a/src/clique/Hash.h b/src/clique/Hash.h
new file mode 100644
index 0000000000..e6cbbaa569
--- /dev/null
+++ b/src/clique/Hash.h
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_HASH_H_
+#define NCCL_HASH_H_
+
+unsigned long djb2Hash(const char* data);
+
+#endif
diff --git a/src/clique/MsgQueue.cc b/src/clique/MsgQueue.cc
new file mode 100644
index 0000000000..716c449e8f
--- /dev/null
+++ b/src/clique/MsgQueue.cc
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "MsgQueue.h"
+
+#include <sys/ipc.h>
+#include <sys/msg.h>
+
+#define MSG_QUEUE_PERM 0666
+
+ncclResult_t MsgQueueGetId(std::string name, int projid, bool exclusive, int& msgid)
+{
+  key_t key;
+  SYSCHECKVAL(ftok(name.c_str(), projid), "ftok", key);
+  int flag = (exclusive == true ? IPC_CREAT | IPC_EXCL : IPC_CREAT);
+
+  msgid = msgget(key, MSG_QUEUE_PERM | flag);
+  // Check if we're trying to create message queue and it already exists; if so, delete existing queue
+  if (msgid == -1 && exclusive == true && errno == EEXIST)
+  {
+    NCCLCHECK(MsgQueueClose(name, projid));
+    SYSCHECKVAL(msgget(key, MSG_QUEUE_PERM | flag), "msgget", msgid);
+  }
+  else if (msgid == -1)
+  {
+    WARN("Call to MsgQueueGetId failed : %s", strerror(errno));
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t MsgQueueSend(int msgid, const void* msgp, size_t msgsz, int msgflg)
+{
+  SYSCHECK(msgsnd(msgid, msgp, msgsz, msgflg), "msgsnd");
+  return ncclSuccess;
+}
+
+ncclResult_t MsgQueueRecv(int msgid, void* msgp, size_t msgsz, long msgtyp, bool wait)
+{
+  int msgflg = (wait == false ? IPC_NOWAIT : 0);
+  SYSCHECK(msgrcv(msgid, msgp, msgsz, msgtyp, msgflg), "msgrcv");
+  return ncclSuccess;
+}
+
+ncclResult_t MsgQueueClose(std::string name, int projid)
+{
+  key_t key;
+  int msgid;
+  key = ftok(name.c_str(), projid);
+  SYSCHECKVAL(msgget(key, IPC_CREAT), "msgget", msgid);
+  SYSCHECK(msgctl(msgid, IPC_RMID, NULL), "msgctl");
+  return ncclSuccess;
+}
diff --git a/src/clique/MsgQueue.h b/src/clique/MsgQueue.h
new file mode 100644
index 0000000000..346208a6e8
--- /dev/null
+++ b/src/clique/MsgQueue.h
@@ -0,0 +1,42 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RCCL_MSG_QUEUE_HPP_
+#define RCCL_MSG_QUEUE_HPP_
+
+#include <string>
+
+#include "nccl.h"
+#include "core.h"
+
+struct MsgBuffer
+{
+  long msg_type;
+  char msg_text[1];
+};
+
+ncclResult_t MsgQueueGetId(std::string name, int projid, bool exclusive, int& msgid);
+ncclResult_t MsgQueueSend(int msgid, const void* msgp, size_t msgsz, int msgflg);
+ncclResult_t MsgQueueRecv(int msgid, void* msgp, size_t msgsz, long msgtyp, bool wait);
+ncclResult_t MsgQueueClose(std::string name, int projid);
+
+#endif
diff --git a/src/clique/SharedMemHelper.h b/src/clique/SharedMemHelper.h
new file mode 100644
index 0000000000..6fc269e7fc
--- /dev/null
+++ b/src/clique/SharedMemHelper.h
@@ -0,0 +1,43 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef SHAREDMEMHELPER_H
+#define SHAREDMEMHELPER_H
+
+
+class SharedMemHelper
+{
+public:
+  SharedMemHelper(int rank, int numRanks, int numEntries);
+
+  ncclStatus_t Init(std::string const& baseFilename);
+
+  ncclStatus_t
+
+
+protected:
+  bool m_initialized;
+  int m_rank;
+  int m_numRanks;
+};
+
+#endif
diff --git a/src/clique/ShmObject.cc b/src/clique/ShmObject.cc
new file mode 100644
index 0000000000..353779e41d
--- /dev/null
+++ b/src/clique/ShmObject.cc
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "ShmObject.h"
+#include <string>
+
+// Template specializations for sem_t objects which require additional initialization
+template<>
+ncclResult_t ShmObject<sem_t>::Close()
+{
+    size_t numMutexes = m_shmSize / sizeof(sem_t);
+
+    for (size_t i = 0; i < numMutexes; i++)
+    {
+        sem_destroy(static_cast<sem_t*>(&m_shmPtr[i]));
+    }
+
+    int retVal = shm_unlink(m_shmName.c_str());
+    if (retVal == -1 && errno != ENOENT)
+    {
+        WARN("Call to shm_unlink in ShmObject failed : %s", strerror(errno));
+        return ncclSystemError;
+    }
+
+    return ncclSuccess;
+}
diff --git a/src/clique/ShmObject.h b/src/clique/ShmObject.h
new file mode 100644
index 0000000000..f458014f5e
--- /dev/null
+++ b/src/clique/ShmObject.h
@@ -0,0 +1,210 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_SHM_OBJECT_H_
+#define NCCL_SHM_OBJECT_H_
+
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <type_traits>
+#include <semaphore.h>
+
+#include "MsgQueue.h"
+#include "nccl.h"
+#include "core.h"
+#include "shm.h"
+
+// ShmObject abstracts away the nitty-gritty when multiple processes need to handle opening a shared
+// memory object at the same time.
+
+static ncclResult_t shmSetupExclusive(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
+  *fd = shm_open(shmname, O_CREAT | O_RDWR | O_EXCL, S_IRUSR | S_IWUSR);
+  if (*fd == -1) return ncclSystemError;
+  if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate");
+  SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap");
+  close(*fd);
+  *fd = -1;
+  if (create) memset(*ptr, 0, shmsize);
+  return ncclSuccess;
+}
+
+template <typename T>
+class ShmObject
+{
+public:
+ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) :
+  m_shmSize(size),
+    m_shmName(fileName),
+    m_rank(rank),
+    m_numRanks(numRanks),
+    m_projid(projid),
+    m_alloc(false),
+    m_shmPtr(nullptr) {}
+
+  ShmObject() {}
+
+  ~ShmObject() {}
+
+  ncclResult_t Open();
+
+  ncclResult_t Close()
+  {
+    if (m_alloc)
+    {
+      if (m_rank == 0)
+      {
+        std::string tmpFileName = "/tmp/" + m_shmName;
+        remove(tmpFileName.c_str());
+      }
+      int retVal = shm_unlink(m_shmName.c_str());
+      if (retVal == -1 && errno != ENOENT)
+      {
+        WARN("Call to shm_unlink in ShmObject failed : %s", strerror(errno));
+        return ncclSystemError;
+      }
+    }
+    return ncclSuccess;
+  }
+
+  T*& Get()
+  {
+    return m_shmPtr;
+  }
+protected:
+  ncclResult_t BroadcastMessage(int msgid, bool pass)
+  {
+    MsgBuffer msg;
+    msg.msg_text[0] = (pass == 0 ? 'F': 'P');
+    for (int rank = 0; rank < m_numRanks; rank++)
+    {
+      if (rank == m_rank) continue;
+      msg.msg_type = rank;
+      NCCLCHECK(MsgQueueSend(msgid, &msg, sizeof(msg), 0));
+    }
+    return ncclSuccess;
+  }
+
+  // tag for dispatch
+      template<class U>
+        struct OpenTag{};
+
+      ncclResult_t InitIfSemaphore(OpenTag<int> tag);
+      ncclResult_t InitIfSemaphore(OpenTag<uint32_t> tag);
+      ncclResult_t InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag);
+      ncclResult_t InitIfSemaphore(OpenTag<sem_t> tag);
+      ncclResult_t InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag);
+
+      size_t      m_shmSize;
+      std::string m_shmName;
+      int         m_rank;
+      int         m_numRanks;
+      int         m_projid;
+      bool        m_alloc;
+      T*          m_shmPtr;
+};
+
+template <typename T>
+ncclResult_t ShmObject<T>::Open()
+{
+  if (m_alloc == false)
+  {
+    int shmFd;
+    int protection = PROT_READ | PROT_WRITE;
+    int visibility = MAP_SHARED;
+
+    int msgid;
+    std::string tmpFileName = "/tmp/" + m_shmName;
+    NCCLCHECK(MsgQueueGetId(tmpFileName, m_projid, false, msgid));
+    if (m_rank == 0)
+    {
+      ncclResult_t resultSetup = shmSetupExclusive(m_shmName.c_str(), m_shmSize, &shmFd, (void**)&m_shmPtr, 1);
+      ncclResult_t resultSemInit = InitIfSemaphore(OpenTag<T>{});
+      if ((resultSetup != ncclSuccess && errno != EEXIST) || (resultSemInit != ncclSuccess))
+      {
+        NCCLCHECK(BroadcastMessage(msgid, false));
+        WARN("Call to ShmObject::Open in root rank failed : %s", strerror(errno));
+        return ncclSystemError;
+      }
+      NCCLCHECK(BroadcastMessage(msgid, true));
+    }
+    else
+    {
+      MsgBuffer msg;
+      NCCLCHECK(MsgQueueRecv(msgid, &msg, sizeof(msg), m_rank, true));
+      if (msg.msg_text[0] == 'P')
+      {
+        NCCLCHECK(shmSetup(m_shmName.c_str(), m_shmSize, &shmFd, (void**)&m_shmPtr, 0));
+      }
+      else
+      {
+        WARN("Call to shm_open from non-root rank in ShmObject failed : %s", strerror(errno));
+        return ncclSystemError;
+      }
+    }
+    m_alloc = true;
+  }
+  else
+  {
+    WARN("Cannot allocate ShmObject twice.\n");
+    return ncclInvalidUsage;
+  }
+  return ncclSuccess;
+}
+
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<int> tag)
+{
+  return ncclSuccess;
+}
+
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<unsigned int> tag)
+{
+  return ncclSuccess;
+}
+
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag)
+{
+  return ncclSuccess;
+}
+
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag)
+{
+  return ncclSuccess;
+}
+
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<sem_t> tag)
+{
+  size_t numMutexes = m_shmSize / sizeof(sem_t);
+
+  for (size_t i = 0; i < numMutexes; i++)
+  {
+    SYSCHECK(sem_init(static_cast<sem_t*>(&m_shmPtr[i]), 1, 1), "sem_init");
+  }
+  return ncclSuccess;
+}
+#endif
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index e7c3c28cfb..a22caaa989 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,6 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
+*  Modifications Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
  * See LICENSE.txt for license information
  ************************************************************************/
 
@@ -8,4 +8,7 @@
 #include "common.h"
 #include "collectives.h"
 
-IMPL_COLL_R(AllReduce);
+// [RCCL]
+// IMPL_COLL_R(AllReduce);
+IMPL_COLL_CLIQUE(AllReduce);
+// [/RCCL]
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 82284147dc..bb573428ac 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,6 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,6 +8,7 @@
 #include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
+#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
 
 template<class FUNC, typename T, int UNROLL>
 class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
@@ -473,82 +474,15 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL, FUNC, T,
 };
 
 #include "prims_ll128.h"
+// [RCCL] RingLL128 is re-purposed as clique-based kernel
 template<class FUNC, typename T, int UNROLL>
-class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
-  public:
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_CLIQUE, FUNC, T, UNROLL> {
+public:
   __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
-    const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
-    struct ncclDevComm* comm = args->comm;
-    struct ncclChannel* channel = comm->channels+blockIdx.x;
-    struct ncclRing* ring = &channel->ring;
-    const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-    ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-    const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-    const int nranks = comm->nRanks;
-    const ssize_t loopSize = nChannels*nranks*chunkSize;
-    const ssize_t size = args->coll.count;
-
-    ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
-
-    // Compute pointers
-    const T * __restrict__ thisInput = (const T*)args->sendbuff;
-    T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
-
-      /////////////// begin AllReduce steps ///////////////
-      ssize_t offset;
-      int nelem;
-      int chunk;
-
-      // step 0: push data to next GPU
-      chunk = ring->devUserRanks[nranks-1];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.send(thisInput+offset, nelem);
-
-      // k-2 steps: reduce and copy to next GPU
-      for (int j=2; j<nranks; ++j) {
-        chunk = ring->devUserRanks[nranks-j];
-        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-        nelem = min(chunkSize, size-offset);
-
-        LLprims.recvReduceSend(thisInput+offset, nelem);
-      }
-
-      // step k-1: reduce this buffer and data, which will produce the final
-      // result that we store in this data and push to the next GPU
-      chunk = ring->devUserRanks[0];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
-
-      // k-2 steps: copy to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        chunk = ring->devUserRanks[nranks-j];
-        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-        nelem = min(chunkSize, size-offset);
-
-        LLprims.recvCopySend(thisOutput+offset, nelem);
-      }
-
-      // Make final copy from buffer to dest.
-      chunk = ring->devUserRanks[1];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      // Here we need to copy from buffer to this output.
-      LLprims.recv(thisOutput+offset, nelem);
-    }
+    LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, FUNC, T, args);
   }
 };
+// [/RCCL]
 
 template<class FUNC, typename T, int UNROLL>
 class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index d96e2e307b..80fdca2b63 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,6 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -70,16 +70,46 @@
   NCCL_FUNCS3B(func, Sum), \
   NCCL_FUNCS3B(func, Sum)
 
+// [RCCL] Adding clique-based kernels for AllReduce, in-place of unused RingLL28 kernels
+#define NCCL_FUNC5B(func, algo, redop, type) \
+  NCCL_FUNC_NAME(func, algo, LL,     redop, type), \
+  NCCL_FUNC_NAME(func, algo, LL128,  redop, type), \
+  NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
+
+#define NCCL_FUNC4B(func, redop, type) \
+  NCCL_FUNC5(func, TREE,    redop, type), \
+  NCCL_FUNC5B(func, RING,    redop, type), \
+  NCCL_FUNC5(func, COLLNET, redop, type)
+
+#define NCCL_FUNCS3C(func, redop) \
+  NCCL_FUNC4B(func, redop, int8_t), \
+  NCCL_FUNC4B(func, redop, uint8_t), \
+  NCCL_FUNC4B(func, redop, int32_t), \
+  NCCL_FUNC4B(func, redop, uint32_t), \
+  NCCL_FUNC4B(func, redop, int64_t), \
+  NCCL_FUNC4B(func, redop, uint64_t), \
+  NCCL_FUNC4B(func, redop, half), \
+  NCCL_FUNC4B(func, redop, float), \
+  NCCL_FUNC4B(func, redop, double), \
+  NCCL_FUNC4B(func, redop, rccl_bfloat16)
+
+#define NCCL_FUNCS2C(func) \
+  NCCL_FUNCS3C(func, Sum ), \
+  NCCL_FUNCS3C(func, Prod), \
+  NCCL_FUNCS3C(func, Max ), \
+  NCCL_FUNCS3C(func, Min )
+
 // Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
   NCCL_FUNCS2B(Broadcast), \
   NCCL_FUNCS2A(Reduce), \
   NCCL_FUNCS2B(AllGather), \
   NCCL_FUNCS2A(ReduceScatter), \
-  NCCL_FUNCS2A(AllReduce), \
+  NCCL_FUNCS2C(AllReduce), \
   NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), \
   NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t), \
   NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t) }
+// [/RCCL]
 
 // Must be consistent with the ncclFuncSet enum
 using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);
@@ -93,7 +123,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
   NCCL_FUNCS2A(Reduce),
   NCCL_FUNCS2B(AllGather),
   NCCL_FUNCS2A(ReduceScatter),
-  NCCL_FUNCS2A(AllReduce),
+  NCCL_FUNCS2C(AllReduce),
   NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
   NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t),
   NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t),
@@ -324,7 +354,7 @@ __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, red
 // Only generate inline kernels for LL
 #define IMPL_COLL4(func, algo, redop, type, ncclType) \
   IMPL_COLL_FUNC(func, algo, LL,     redop, type) \
-  IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type) \
+  IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type)
 
 #define IMPL_COLL3(func, redop, type, ncclType) \
   IMPL_COLL4(func, TREE,    redop, type, ncclType) \
@@ -350,6 +380,37 @@ __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, red
   IMPL_COLL2(func, Min) \
   IMPL_COLL2(func, Max)
 
+// [RCCL] Define clique-based implementations (repurposed LL128)
+#define IMPL_COLL4_CLIQUE(func, algo, redop, type, ncclType) \
+  IMPL_COLL_FUNC(func, algo, LL,     redop, type) \
+  IMPL_COLL_FUNC(func, algo, LL128,  redop, type) \
+  IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type)
+
+#define IMPL_COLL3_CLIQUE(func, redop, type, ncclType) \
+  IMPL_COLL4(func, TREE,    redop, type, ncclType) \
+  IMPL_COLL4_CLIQUE(func, RING,    redop, type, ncclType) \
+  IMPL_COLL4(func, COLLNET, redop, type, ncclType)
+
+#define IMPL_COLL2_CLIQUE(func, redop) \
+  IMPL_COLL3_CLIQUE(func, redop, int8_t,   ncclInt8) \
+  IMPL_COLL3_CLIQUE(func, redop, uint8_t,  ncclUint8) \
+  IMPL_COLL3_CLIQUE(func, redop, int32_t,  ncclInt32) \
+  IMPL_COLL3_CLIQUE(func, redop, uint32_t, ncclUint32) \
+  IMPL_COLL3_CLIQUE(func, redop, int64_t,  ncclInt64) \
+  IMPL_COLL3_CLIQUE(func, redop, uint64_t, ncclUint64) \
+  IMPL_COLL3_CLIQUE(func, redop, half,     ncclFloat16) \
+  IMPL_COLL3_CLIQUE(func, redop, float,    ncclFloat32) \
+  IMPL_COLL3_CLIQUE(func, redop, double,   ncclFloat64) \
+  IMPL_COLL3_CLIQUE(func, redop, rccl_bfloat16,   ncclBfloat16)
+
+#define IMPL_COLL_CLIQUE(func) \
+  IMPL_COLL2_CLIQUE(func, Sum) \
+  IMPL_COLL2_CLIQUE(func, Prod) \
+  IMPL_COLL2_CLIQUE(func, Min) \
+  IMPL_COLL2_CLIQUE(func, Max)
+// [/RCCL]
+
+
 // Copy primitives only define one function for copy
 #define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
 
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index 1aaef238ff..30c9619887 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -382,6 +382,10 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 // Multiply UNROLL by 2 if single source/single destination
 #define AUTOUNROLL (UNROLL*((MINSRCS==1 && MINDSTS==1) ? 2 : 1))
+#else
+// Try to limit consecutive load/stores to 8.
+// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
 #endif
 
 template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
diff --git a/src/enqueue.cc b/src/enqueue.cc
index e32b519974..f5e769a5a2 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,6 +9,8 @@
 #include "argcheck.h"
 #include "coll_net.h"
 #include "graph/topo.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_ext.h>
 
 // Only generate inline kernels for LL
 #define NCCL_FUNC5(func, algo, redop, dtype) \
@@ -126,6 +128,10 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params)
     STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
   }
 
+  { // [RCCL] Wait for any clique-based collectives
+    NCCLCHECK(comm->cliqueManager->WaitForPointers());
+  } // [/RCCL]
+
   // Find the first operation, choose the kernel accordingly and pass it
   // as the first argument.
   struct ncclChannel* c0 = comm->channels;
@@ -224,7 +230,8 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
         (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
   }
 
-
+  hipEvent_t startEvent;
+  hipEvent_t stopEvent;
   if (comm->launchMode == ncclComm::PARALLEL) {
     hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
   } else {
@@ -283,6 +290,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
   info->algorithm = -1;
   info->protocol = -1;
   int nAlgos = NCCL_NUM_ALGORITHMS;
+
   // Check collNet support
   int collNetTypeSupport = 0;
   if (info->comm->collNetSupport)
@@ -321,7 +329,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
   }
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
-  if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync 
+  if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
   if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
 #endif
   info->nChannels = nc;
@@ -401,6 +409,33 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
 
   work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
 
+  { // [RCCL] Check for clique-based kernel support
+    if (info->comm->cliqueManager->IsSupported(info->coll,
+                                               info->count,
+                                               info->datatype,
+                                               info->op))
+    {
+      // Declare the input / output pointers being used (to exchange via IPC with other ranks)
+      NCCLCHECK(info->comm->cliqueManager->DeclarePointers(info->comm->opCount,
+                                                           info->sendbuff,
+                                                           info->recvbuff));
+
+
+      info->algorithm = NCCL_ALGO_RING;
+      info->protocol = NCCL_PROTO_CLIQUE;
+      // Determine the number of channels to use for clique-kernel
+      NCCLCHECK(info->comm->cliqueManager->GetNumChannelsToUse(info->coll,
+                                                               info->count,
+                                                               info->datatype,
+                                                               info->op,
+                                                               info->comm->nChannels,
+                                                               &work->clique.nChannels));
+      work->clique.count = info->count;
+      work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
+      return ncclSuccess;
+    }
+  } // [RCCL]
+
   int stepSize   = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
   int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
   int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
@@ -487,7 +522,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
 
   info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
 
-  int nChannels = work.coll.nChannels;
+  int nChannels = (info->coll == ncclFuncAllToAllv) ? work.a2av.nChannels : work.coll.nChannels;
   int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
 
   for (int bid=0; bid<nChannels*nSubChannels; bid++) {
@@ -509,6 +544,13 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
     } else {
       work.coll.bid = bid % nChannels;
     }
+
+    // [RCCL] Setup pointers to where all the input/output pointers will be
+    if (info->protocol == NCCL_PROTO_CLIQUE) {
+      NCCLCHECK(info->comm->cliqueManager->SetCliqueArgs(&work));
+    }
+    // [/RCCL]
+
     struct ncclWork* w;
     NCCLCHECK(getNextOp(channel, &w, &work));
     if (info->coll == ncclFuncAllToAllv) {
@@ -687,6 +729,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
     } else {
       NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
     }
+
 end:
     if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
     ncclAsyncErrCheck(ret);
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index ef7921a19c..7926785e73 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -12,7 +12,7 @@
 ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
-ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
+ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
diff --git a/src/include/comm.h b/src/include/comm.h
index 4858f946d9..4817162ba6 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -10,6 +10,9 @@
 
 #include "transport.h"
 #include "p2p.h"
+// [RCCL]
+#include "clique/CliqueManager.h"
+// [/RCCL]
 
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
@@ -156,8 +159,12 @@ struct ncclComm {
   int p2pSendCount;
   int p2pRecvCount;
 
-  // RCCL AllToAll/Scatter/Gather API
-  bool alltoallDisable;
+  // [RCCL]
+  bool alltoallDisable;            // RCCL AllToAll/Scatter/Gather API
+  CliqueManager* cliqueManager;    // CliqueManager handles pointer collection / distribution for clique-based kernels
+  int rootPid;                     // Process ID of root
+  // [/RCCL]
+
 };
 
 #endif
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 7fa6842b8d..065d7590e1 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -1,6 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,6 +12,9 @@
 #include "rccl_bfloat16.h"
 #include "align.h"
 #include <stdint.h>
+// [RCCL] Support for clique-based kernels
+#include "clique/CliqueCommon.h"
+// [/RCCL]
 
 // Convert volatile access to atomic
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
@@ -22,6 +25,7 @@
 #define STORE(DST, SRC) *(DST) = (SRC)
 #endif
 
+
 #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
 typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAll, ncclFuncAllToAllv } ncclFunc_t;
 extern const char* ncclFuncStr[];
@@ -35,6 +39,7 @@ extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
 #define NCCL_PROTO_LL 0
 #define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_CLIQUE 1  // [RCCL] Clique takes up same protocol as unused LL128
 #define NCCL_PROTO_SIMPLE 2
 extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
 
@@ -188,6 +193,17 @@ struct ncclWorkElem {
       uint8_t bid;
       uint8_t nChannels;
     } a2av;
+    // [RCCL] Clique-based arguments
+    //        NOTE: Follows same field structure as coll
+    //              because nChannels is accessed from "coll" struct.
+    struct {
+      size_t count;
+      cliqueDevicePtrs_t* ptrs;
+      uint32_t unused;
+      uint8_t bid;
+      uint8_t nChannels;
+    } clique;
+    // [/RCCL]
     uint64_t align[3];
   };
 };
diff --git a/src/init.cc b/src/init.cc
index 421ceeda07..b534264dee 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -28,6 +28,10 @@
 #include <unistd.h>
 #include "graph/topo.h"
 
+// [RCCL]
+#include "clique/CliqueManager.h"
+// [/RCCL]
+
 #define STR2(v) #v
 #define STR(v) STR2(v)
 
@@ -357,6 +361,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
 }
 
 RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 1);
+RCCL_PARAM(ForceEnableClique, "FORCE_ENABLE_CLIQUE", 0);
 
 static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   if (ndev < 1) {
@@ -738,7 +743,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   int nranks = comm->nRanks;
   uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
   TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+  // [RCCL] Collect the PID of the root
+  int rootPid;
+  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap, &rootPid));
+  // [/RCCL]
 
   // AllGather1 - begin
   struct {
@@ -811,6 +819,50 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   // Print final topology
   NCCLCHECK(ncclTopoPrint(comm->topo));
 
+  { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
+    CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
+    if (intraRanks == nranks)
+    {
+      // Check that all the GPUs have peer access to one another
+      bool hasPeerAccess = true;
+      for (int i = 0; i < nranks && hasPeerAccess; i++)
+      {
+        int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
+        for (int j = 0; j < nranks; j++)
+        {
+          if (i == j) continue;
+          int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
+          int p2p;
+          if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
+          {
+            hasPeerAccess = false;
+            break;
+          }
+        }
+      }
+      if (hasPeerAccess)
+      {
+        if (intraRanks == nranks)
+          cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
+        else
+          cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
+      }
+
+      // For now, only enable clique-based kernels on CR8_G topologies, unless explicitly asked
+      if (!rcclParamForceEnableClique())
+      {
+        // Disable clique-kernel support if not on CR8 topology
+        if (!(comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G))
+        {
+          INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (force enable with RCCL_FORCE_ENABLE_CLIQUE)");
+          cliqueMode = CliqueManager::CLIQUE_DISABLED;
+        }
+      }
+    }
+    comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
+    NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
+  } // [/RCCL]
+
   // Get rings and trees
   struct ncclTopoGraph ringGraph;
   ringGraph.id = 0;
@@ -1217,6 +1269,10 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
     return ncclInvalidArgument;
   }
 
+  // [RCCL] Delete CliqueManager if it exists
+  if (comm->cliqueManager) delete comm->cliqueManager;
+  // [/RCCL]
+
   return commDestroy(comm);
 }
 
diff --git a/test/CorrectnessTest.hpp b/test/CorrectnessTest.hpp
index 3c7d6722b9..a2715ce597 100644
--- a/test/CorrectnessTest.hpp
+++ b/test/CorrectnessTest.hpp
@@ -418,23 +418,24 @@ namespace CorrectnessTests
                         switch (dataset.dataType)
                         {
                         case ncclInt8:
-                            printf("Expected %d.  Output %d on device %d[%d]\n", outputI1[j], expectedI1[j], i, j); break;
+                            printf("Expected %d.  Output %d on device %d[%d]\n", expectedI1[j], outputI1[j], i, j);
+                            break;
                         case ncclUint8:
-                            printf("Expected %u.  Output %u on device %d[%d]\n", outputU1[j], expectedU1[j], i, j); break;
+                            printf("Expected %u.  Output %u on device %d[%d]\n", expectedU1[j], outputU1[j], i, j); break;
                         case ncclInt32:
-                            printf("Expected %d.  Output %d on device %d[%d]\n", outputI4[j], expectedI4[j], i, j); break;
+                            printf("Expected %d.  Output %d on device %d[%d]\n", expectedI4[j], outputI4[j], i, j); break;
                         case ncclUint32:
-                            printf("Expected %u.  Output %u on device %d[%d]\n", outputU4[j], expectedU4[j], i, j); break;
+                            printf("Expected %u.  Output %u on device %d[%d]\n", expectedU4[j], outputU4[j], i, j); break;
                         case ncclInt64:
-                            printf("Expected %ld.  Output %ld on device %d[%d]\n", outputI8[j], expectedI8[j], i, j); break;
+                            printf("Expected %ld.  Output %ld on device %d[%d]\n", expectedI8[j], outputI8[j], i, j); break;
                         case ncclUint64:
-                            printf("Expected %lu.  Output %lu on device %d[%d]\n", outputU8[j], expectedU8[j], i, j); break;
+                            printf("Expected %lu.  Output %lu on device %d[%d]\n", expectedU8[j], outputU8[j], i, j); break;
                         case ncclFloat32:
-                            printf("Expected %f.  Output %f on device %d[%d]\n", outputF4[j], expectedF4[j], i, j); break;
+                            printf("Expected %f.  Output %f on device %d[%d]\n", expectedF4[j], outputF4[j], i, j); break;
                         case ncclFloat64:
-                            printf("Expected %lf.  Output %lf on device %d[%d]\n", outputF8[j], expectedF8[j], i, j); break;
+                            printf("Expected %lf.  Output %lf on device %d[%d]\n", expectedF8[j], outputF8[j], i, j); break;
                         case ncclBfloat16:
-                            printf("Expected %f.  Output %f on device %d[%d]\n", (float)outputB2[j], (float)expectedB2[j], i, j); break;
+                            printf("Expected %f.  Output %f on device %d[%d]\n", (float)expectedB2[j], (float)outputB2[j], i, j); break;
                         default:
                             fprintf(stderr, "[ERROR] Unsupported datatype\n");
                             exit(0);
diff --git a/test/test_AllReduce.cpp b/test/test_AllReduce.cpp
index 992b7c32d4..28f76e52d2 100644
--- a/test/test_AllReduce.cpp
+++ b/test/test_AllReduce.cpp
@@ -58,6 +58,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false, true),
-                                testing::Values("")),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp
index e7be7385a1..aa842975bd 100644
--- a/test/test_CombinedCalls.cpp
+++ b/test/test_CombinedCalls.cpp
@@ -114,7 +114,7 @@ namespace CorrectnessTests
                                 // Number of devices
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
-                                testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")),
-                            CorrectnessTest::PrintToStringParamName());
+                                testing::Values(false, true),
+                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1", "RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_GroupCalls.cpp b/test/test_GroupCalls.cpp
index b3ab2ca09b..f94dd6cf6a 100644
--- a/test/test_GroupCalls.cpp
+++ b/test/test_GroupCalls.cpp
@@ -116,6 +116,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false, true),
-                                testing::Values("")),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
                              CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/tools/HelloRccl/HelloRccl.cpp b/tools/HelloRccl/HelloRccl.cpp
new file mode 100644
index 0000000000..4eb72b6a52
--- /dev/null
+++ b/tools/HelloRccl/HelloRccl.cpp
@@ -0,0 +1,257 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <sys/socket.h>
+#include <ifaddrs.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <cstdio>
+#include <string>
+#include <chrono>
+#include <hip/hip_runtime.h>
+#include <rccl.h>
+#include "HelloRccl.hpp"
+
+
+void Usage(char *argv0);
+void ExecuteTest(int numIntraRank, int intraRankStartId, int numTotalRanks, ncclComm_t* comm);
+
+int main(int argc, char **argv)
+{
+  if (getenv("NCCL_COMM_ID") && argc == 3) // Run in multi-process mode
+  {
+    int nranks   = atoi(argv[1]);
+    int rank     = atoi(argv[2]);
+    if (rank == 0) printf("Running in multi-process mode\n");
+
+    // Create communicator for this rank
+    ncclUniqueId commId;
+    NCCL_CALL(ncclGetUniqueId(&commId));
+
+    // Initialize communicator
+    ncclComm_t comm;
+    HIP_CALL(hipSetDevice(rank));
+    NCCL_CALL(ncclCommInitRank(&comm, nranks, commId, rank));
+
+    // Run the test
+    ExecuteTest(1, rank, nranks, &comm);
+  }
+  else if (argc == 2) // Run in single-process mode
+  {
+    printf("Running in single-process mode\n");
+
+    int nranks   = atoi(argv[1]);
+
+    // Initialize communicators for each rank
+    ncclComm_t comm[nranks];
+    NCCL_CALL(ncclCommInitAll(comm, nranks, NULL));
+
+    // Run the test
+    ExecuteTest(nranks, 0, nranks, comm);
+  }
+  else
+  {
+    Usage(argv[0]);
+    return 1;
+  }
+  return 0;
+}
+
+void ExecuteTest(int numIntraRank, int intraRankStartId, int numTotalRanks, ncclComm_t* comm)
+{
+  // Test configuration settings
+  int minPow        = 10;      // Starting power of 2 input size
+  int maxPow        = 28;      // Ending power of 2 input size
+  int numWarmups    =  3;      // Number of untimed warmup iterations
+  int numIterations = 10;      // Number of timed iterations
+
+  // Allocate GPU resources for this process
+  hipStream_t stream[numIntraRank];
+  hipEvent_t  startEvent[numIntraRank];
+  hipEvent_t  stopEvent[numIntraRank];
+  for (int i = 0; i < numIntraRank; i++)
+  {
+    HIP_CALL(hipSetDevice(intraRankStartId + i));
+    HIP_CALL(hipStreamCreate(&stream[i]));
+    HIP_CALL(hipEventCreate(&startEvent[i]));
+    HIP_CALL(hipEventCreate(&stopEvent[i]));
+  }
+
+  if (intraRankStartId == 0)
+  {
+    printf("AllReduce Performance (sum of floats):\n");
+    printf("%10s %10s %10s\n", "Bytes", "CpuTime(ms)", "GpuTime(ms)");
+  }
+
+  // Loop over power-of-two input sizes
+  for (int power = minPow; power <= maxPow; power++)
+  {
+    int N = 1 << power;
+
+    // Allocate GPU memory
+    float *iputGpu[numIntraRank], *oputGpu[numIntraRank];
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipSetDevice(intraRankStartId + r));
+      HIP_CALL(hipMalloc((void **)&iputGpu[r], N * sizeof(float)));
+      HIP_CALL(hipMalloc((void **)&oputGpu[r], N * sizeof(float)));
+    }
+
+    // Allocate CPU memory for input/output
+    float *iputCpu = (float *)malloc(N * sizeof(float));
+    float *oputCpu = (float *)malloc(N * sizeof(float));
+
+    // Fill CPU memory with a simple pattern
+    for (int i = 0; i < N; i++)
+    {
+      iputCpu[i] = 1.0f;
+      oputCpu[i] = 0.0f;
+    }
+
+    // Copy the input from CPU memory to GPU memory
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipSetDevice(intraRankStartId + r));
+      HIP_CALL(hipMemcpy(iputGpu[r], iputCpu, N * sizeof(float), hipMemcpyHostToDevice));
+    }
+
+    // Perform some untimed initial warmup iterations
+    for (int iteration = 0; iteration < numWarmups; iteration++)
+    {
+      NCCL_CALL(ncclGroupStart());
+      for (int r = 0; r < numIntraRank; r++)
+      {
+        HIP_CALL(hipSetDevice(intraRankStartId + r));
+        NCCL_CALL(ncclAllReduce(iputGpu[r], oputGpu[r], N, ncclFloat, ncclSum, comm[r], stream[r]));
+      }
+      NCCL_CALL(ncclGroupEnd());
+    }
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipStreamSynchronize(stream[r]));
+
+    // Perform timed iterations
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipEventRecord(startEvent[r], stream[r]));
+
+    for (int iteration = 0; iteration < numIterations; iteration++)
+    {
+      NCCL_CALL(ncclGroupStart());
+      for (int r = 0; r < numIntraRank; r++)
+      {
+        HIP_CALL(hipSetDevice(intraRankStartId + r));
+        NCCL_CALL(ncclAllReduce(iputGpu[r], oputGpu[r], N, ncclFloat, ncclSum, comm[r], stream[r]));
+      }
+      NCCL_CALL(ncclGroupEnd());
+    }
+
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipEventRecord(stopEvent[r], stream[r]));
+
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipStreamSynchronize(stream[r]));
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(cpuDelta).count();
+
+    float totalGpuTime;
+    HIP_CALL(hipEventElapsedTime(&totalGpuTime, startEvent[0], stopEvent[0]));
+
+    if (intraRankStartId == 0) printf("%10lu %10.3f %10.3f\n", N * sizeof(float), (totalCpuTime / numIterations), (totalGpuTime / numIterations));
+
+    // Validate results
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipMemcpy(oputCpu, oputGpu[r], N * sizeof(float), hipMemcpyDeviceToHost));
+      bool isOK = true;
+      int expected = numTotalRanks;
+      for (int i = 0; i < N; i++)
+      {
+        isOK &= (oputCpu[i] == expected);
+      }
+      if (!isOK)
+      {
+        printf("[ERROR] Rank %d Incorrect results for N = %d\n", intraRankStartId + r, N);
+        NCCL_CALL(ncclCommDestroy(comm[r]));
+        exit(1);
+      }
+    }
+
+    // Release GPU resources
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipFree(oputGpu[r]));
+      HIP_CALL(hipFree(iputGpu[r]));
+    }
+    free(iputCpu);
+    free(oputCpu);
+  }
+
+  for (int r = 0; r < numIntraRank; r++)
+  {
+    HIP_CALL(hipStreamDestroy(stream[r]));
+    HIP_CALL(hipEventDestroy(startEvent[r]));
+    HIP_CALL(hipEventDestroy(stopEvent[r]));
+    NCCL_CALL(ncclCommDestroy(comm[r]));
+  }
+}
+
+void Usage(char *argv0)
+{
+  printf("Single Process Usage: %s numRanks\n", argv0);
+  printf("\n");
+  printf("Multi Process Usage: %s numRanks rank\n", argv0);
+  printf(" - NCCL_COMM_ID must be set in order to use this\n\n");
+  printf(" - To use this process as the root process you may use any of the following:\n");
+
+  char hostname[256];
+  gethostname(hostname, 256);
+  printf("    export NCCL_COMM_ID=%s:12345\n", hostname);
+
+  // Loop over linked list of interfaces
+  struct ifaddrs *ifaddr;
+  getifaddrs(&ifaddr);
+  for (struct ifaddrs* ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next)
+  {
+    // Skip anything not based on IPv4 / IPv6
+    int family = ifa->ifa_addr->sa_family;
+    if (family != AF_INET && family != AF_INET6) continue;
+
+    // Skip iPv6 loopback interface
+    if (family == AF_INET6)
+    {
+      struct sockaddr_in6* sa = (struct sockaddr_in6*)(ifa->ifa_addr);
+      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+    }
+
+    socklen_t saLen = (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6));
+    char host[NI_MAXHOST];
+    char service[NI_MAXSERV];
+
+    getnameinfo(ifa->ifa_addr, saLen, host, NI_MAXHOST, service, NI_MAXSERV,
+                NI_NUMERICHOST|NI_NUMERICSERV);
+
+    std::string result = std::string(host) + ":12345";
+    printf("    export NCCL_COMM_ID=%s\n", result.c_str());
+  }
+  freeifaddrs(ifaddr);
+}
diff --git a/tools/HelloRccl/HelloRccl.hpp b/tools/HelloRccl/HelloRccl.hpp
new file mode 100644
index 0000000000..55d24ecd5d
--- /dev/null
+++ b/tools/HelloRccl/HelloRccl.hpp
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HELLORCCL_HPP
+#define HELLORCCL_HPP
+#include <iostream>
+
+#define HIP_CALL(cmd)                                                 \
+  do {                                                                \
+    hipError_t error = (cmd);                                         \
+    if (error != hipSuccess)                                          \
+    {                                                                   \
+      std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line " \
+                << __LINE__ << " in file " << __FILE__ << "\n";         \
+      exit(-1);                                                         \
+    }                                                                   \
+  } while (0)
+
+#define NCCL_CALL(cmd) \
+  do { \
+    ncclResult_t error = (cmd);                 \
+    if (error != ncclSuccess)                   \
+    {                                           \
+      std::cerr << "Encountered NCCL error (" << ncclGetErrorString(error) << ") at line " \
+                << __LINE__ << " in file " << __FILE__ << "\n";         \
+      exit(-1);                                                         \
+    }                                                                   \
+  } while (0)
+
+#endif
diff --git a/tools/HelloRccl/Makefile b/tools/HelloRccl/Makefile
new file mode 100644
index 0000000000..8729c17feb
--- /dev/null
+++ b/tools/HelloRccl/Makefile
@@ -0,0 +1,21 @@
+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+# Set to where RCCL is installed
+RCCL_INSTALL=../../build/release
+
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+EXE=HelloRccl
+CXXFLAGS = -std=c++11 -O3 -I../../src/include -I$(RCCL_INSTALL) -L$(RCCL_INSTALL) -lrccl
+
+all: $(EXE)
+
+$(EXE): $(EXE).cpp $(shell find -regex ".*\.\hpp")
+	$(HIPCC) $(CXXFLAGS) $< -o $@
+
+clean:
+	rm -f *.o $(EXE)
diff --git a/tools/HelloRccl/runTest.sh b/tools/HelloRccl/runTest.sh
new file mode 100755
index 0000000000..c6e832e3c4
--- /dev/null
+++ b/tools/HelloRccl/runTest.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+RCCL_INSTALL=../../build/release
+EXE=$PWD/HelloRccl
+LDPATH=$LD_LIBRARY_PATH:$RCCL_INSTALL
+
+echo "Single process - With clique-based kernels:"
+RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 LD_LIBRARY_PATH=$LDPATH $EXE 4
+
+echo "Single process - Without clique-based kernels:"
+NCCL_DEBUG=INFO LD_LIBRARY_PATH=$LDPATH $EXE 4
+
+echo "With clique-based kernels:"
+RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 0 &
+RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 1 &
+RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 2 &
+RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 3
+
+echo "Without clique-based kernels:"
+NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 0 &
+NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 1 &
+NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 2 &
+NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 3

From 01a998b17c20ae09b7c77814d6b52c6882dd2eaf Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Thu, 28 Jan 2021 20:09:03 +0000
Subject: [PATCH 10/25] Removing in-place tests from Combined calls (no support
 for send/recv)

---
 test/test_CombinedCalls.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp
index aa842975bd..dc6ac1fc2d 100644
--- a/test/test_CombinedCalls.cpp
+++ b/test/test_CombinedCalls.cpp
@@ -114,7 +114,7 @@ namespace CorrectnessTests
                                 // Number of devices
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
-                                testing::Values(false, true),
+                                testing::Values(false),
                                 testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1", "RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
                              CorrectnessTest::PrintToStringParamName());
 } // namespace

From 5f9712244256dcbf852e0d9902c0561dd95a403f Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 3 Feb 2021 02:48:30 +0000
Subject: [PATCH 11/25] Enable GPU direct RDMA read from GPU

---
 src/collectives/device/primitives.h |  1 +
 src/graph/paths.cc                  |  4 ----
 src/include/devcomm.h               |  1 +
 src/transport/net.cc                | 25 +++++++++++++++++++++++++
 4 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 5ffb02601c..a5e52aca7a 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -158,6 +158,7 @@ class ncclPrimitives {
 
   inline __device__ void postSend() {
     if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
+    if (conn->curr_hdp_reg) STORE(conn->curr_hdp_reg, 0x1);
     STORE(connTailPtr, step += SLICESTEPS);
   }
 
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 567043d4f6..c9924229be 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -316,9 +316,6 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
   if (read) { // For reads (sends) only enable under certain conditions
     int gdrReadParam = ncclParamNetGdrRead();
     if (gdrReadParam == 0) return ncclSuccess;
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    return ncclSuccess;
-#else
     if (gdrReadParam < 0) {
       int nvlink = 0;
       // Since we don't know whether there are other communicators,
@@ -333,7 +330,6 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
       }
       if (!nvlink) return ncclSuccess;
     }
-#endif
   }
 
   // Check if we are close enough that it makes sense to enable GDR
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 065d7590e1..bbccb3ac84 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -115,6 +115,7 @@ struct ncclConnInfo {
   // allows software to explicitly initiate a flush read to HDP memory. See more
   // descriptions in primitives.h.
   uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
+  uint32_t* curr_hdp_reg;  // Current GPU's HDP register
 };
 
 struct ncclConnector {
diff --git a/src/transport/net.cc b/src/transport/net.cc
index e4178366cf..1f04d3ef13 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -10,6 +10,7 @@
 #include "graph.h"
 #include <sys/time.h>
 #include "collectives.h"
+#include <hsa/hsa_ext_amd.h>
 
 struct netConnectInfo {
   ncclNetHandle_t netHandle;
@@ -32,6 +33,7 @@ struct netSendResources {
   void** mhandlesProto[NCCL_NUM_PROTOCOLS];
   uint64_t step;
   uint64_t llLastCleaning;
+  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };
 
 struct netRecvResources {
@@ -99,6 +101,29 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     if (resources->buffSizes[LOC_HOSTMEM]) {
       NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
     }
+    if (resources->useGdr) {
+      //CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
+      struct data_struct {hsa_agent_t agent; int counter;} out;
+      out.counter = 0;
+      out.agent.handle = myInfo->cudaDev;
+      hsa_iterate_agents([](hsa_agent_t agent, void* data) {
+        int devId = ((struct data_struct *)data)->agent.handle;
+        hsa_device_type_t type;
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+        if(type != HSA_DEVICE_TYPE_GPU)
+          return HSA_STATUS_SUCCESS;
+        if(((struct data_struct *)data)->counter!=devId) {
+          ((struct data_struct *)data)->counter++;
+          return HSA_STATUS_SUCCESS;
+        }
+        ((struct data_struct *)data)->agent = agent;
+        return HSA_STATUS_SUCCESS;
+      }, (void*)&out);
+      hsa_amd_hdp_flush_t hdpinfo;
+      hsa_status_t err = hsa_agent_get_info(out.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
+      resources->curr_hdp_reg = hdpinfo.HDP_MEM_FLUSH_CNTL;
+      send->conn.curr_hdp_reg = resources->curr_hdp_reg;
+    }
 
     int offsets[LOC_COUNT];
     offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;

From 9ce203dd0aaf5b59a1b9c68321c28f4ff9f7c6c6 Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Thu, 4 Feb 2021 18:50:27 +0000
Subject: [PATCH 12/25] [TransferBench] Updating for 2.8.3

---
 tools/TransferBench/EnvVars.hpp       | 137 ++++
 tools/TransferBench/Makefile          |   2 +-
 tools/TransferBench/TransferBench.cpp | 996 +++++++++++++++-----------
 tools/TransferBench/TransferBench.hpp | 128 +++-
 tools/TransferBench/copy_kernel.h     | 310 --------
 tools/TransferBench/example.cfg       |  68 +-
 6 files changed, 832 insertions(+), 809 deletions(-)
 create mode 100644 tools/TransferBench/EnvVars.hpp
 delete mode 100644 tools/TransferBench/copy_kernel.h

diff --git a/tools/TransferBench/EnvVars.hpp b/tools/TransferBench/EnvVars.hpp
new file mode 100644
index 0000000000..258f0ee33c
--- /dev/null
+++ b/tools/TransferBench/EnvVars.hpp
@@ -0,0 +1,137 @@
+#ifndef ENVVARS_HPP
+#define ENVVARS_HPP
+
+// This class manages environment variable that affect TransferBench
+class EnvVars
+{
+public:
+  // Default configuration values
+  int const DEFAULT_NUM_WARMUPS      =  3;
+  int const DEFAULT_NUM_ITERATIONS   = 10;
+  int const DEFAULT_SAMPLING_FACTOR  =  1;
+  int const DEFAULT_NUM_CPU_PER_LINK =  4;
+
+  // Environment variables
+  int useHipCall;      // Use hipMemcpy/hipMemset instead of custom shader kernels
+  int useMemset;       // Perform a memset instead of a copy (ignores source memory)
+  int useSingleSync;   // Perform synchronization only once after all iterations instead of per iteration
+  int useInteractive;  // Pause for user-input before starting transfer loop
+  int useSleep;        // Adds a 100ms sleep after each synchronization
+  int combineTiming;   // Combines the timing with kernel launch
+  int showAddr;        // Print out memory addresses for each Link
+  int outputToCsv;     // Output in CSV format
+  int byteOffset;      // Byte-offset for memory allocations
+  int numWarmups;      // Number of un-timed warmup iterations to perform
+  int numIterations;   // Number of timed iterations to perform
+  int samplingFactor;  // Affects how many different values of N are generated (when N set to 0)
+  int numCpuPerLink;   // Number of CPU child threads to use per CPU link
+
+  // Constructor that collects values
+  EnvVars()
+  {
+    useHipCall      = GetEnvVar("USE_HIP_CALL"     , 0);
+    useMemset       = GetEnvVar("USE_MEMSET"       , 0);
+    useSingleSync   = GetEnvVar("USE_SINGLE_SYNC"  , 0);
+    useInteractive  = GetEnvVar("USE_INTERACTIVE"  , 0);
+    combineTiming   = GetEnvVar("COMBINE_TIMING"   , 0);
+    showAddr        = GetEnvVar("SHOW_ADDR"        , 0);
+    outputToCsv     = GetEnvVar("OUTPUT_TO_CSV"    , 0);
+    byteOffset      = GetEnvVar("BYTE_OFFSET"      , 0);
+    numWarmups      = GetEnvVar("NUM_WARMUPS"      , DEFAULT_NUM_WARMUPS);
+    numIterations   = GetEnvVar("NUM_ITERATIONS"   , DEFAULT_NUM_ITERATIONS);
+    samplingFactor  = GetEnvVar("SAMPLING_FACTOR"  , DEFAULT_SAMPLING_FACTOR);
+    numCpuPerLink   = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
+
+    // Perform some basic validation
+    if (byteOffset % sizeof(float))
+    {
+      printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
+      exit(1);
+    }
+    if (numWarmups < 0)
+    {
+      printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
+      exit(1);
+    }
+    if (numIterations <= 0)
+    {
+      printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n");
+      exit(1);
+    }
+    if (samplingFactor < 1)
+    {
+      printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
+      exit(1);
+    }
+    if (numCpuPerLink < 1)
+    {
+      printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
+      exit(1);
+    }
+  }
+
+  // Display info on the env vars that can be used
+  static void DisplayUsage()
+  {
+    printf("Environment variables:\n");
+    printf("======================\n");
+    printf(" USE_HIP_CALL       - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
+    printf(" USE_MEMSET         - Perform a memset instead of a copy (ignores source memory)\n");
+    printf(" USE_SINGLE_SYNC    - Perform synchronization only once after all iterations instead of per iteration\n");
+    printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
+    printf(" COMBINE_TIMING     - Combines timing with launch (potentially lower timing overhead)\n");
+    printf(" SHOW_ADDR          - Print out memory addresses for each Link\n");
+    printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
+    printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
+    printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
+    printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
+    printf(" SAMPLING_FACTOR=F  - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
+    printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
+  }
+
+  // Display env var settings
+  void DisplayEnvVars() const
+  {
+    if (!outputToCsv)
+    {
+      printf("Run configuration\n");
+      printf("=====================================================\n");
+      printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
+             useHipCall ? "HIP functions" : "custom kernels");
+      printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
+             useMemset ? "memset" : "memcopy");
+      if (useHipCall && !useMemset)
+      {
+        char* env = getenv("HSA_ENABLE_SDMA");
+        printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
+               (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
+      }
+      printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync,
+             useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
+      printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
+             useInteractive ? "interactive" : "non-interactive");
+      printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
+             combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
+      printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr,
+             showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
+      printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
+             outputToCsv ? "CSV" : "console");
+      printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
+      printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
+      printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
+      printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
+      printf("\n");
+    }
+  };
+
+private:
+  // Helper function that gets parses environment variable or sets to default value
+  int GetEnvVar(std::string const varname, int defaultValue)
+  {
+    if (getenv(varname.c_str()))
+      return atoi(getenv(varname.c_str()));
+    return defaultValue;
+  }
+};
+
+#endif
diff --git a/tools/TransferBench/Makefile b/tools/TransferBench/Makefile
index af0b4597b7..c51bd47b4c 100644
--- a/tools/TransferBench/Makefile
+++ b/tools/TransferBench/Makefile
@@ -6,7 +6,7 @@ endif
 HIPCC=$(HIP_PATH)/bin/hipcc
 
 EXE=TransferBench
-CXXFLAGS = -O3 -I../../src/include -I.
+CXXFLAGS = -O3 -I../../src/include -I. -lnuma
 
 all: $(EXE)
 
diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index 732a405ee7..faed8dae44 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -24,11 +24,13 @@ THE SOFTWARE.
 // on the same node
 
 #include "TransferBench.hpp"
+#include <numa.h>
+#include <numaif.h>
+#include <stack>
+#include <thread>
 
 // Simple configuration parameters
-size_t const DEFAULT_BYTES_PER_LINK = (1<<26);
-int    const DEFAULT_NUM_WARMUPS    = 3;
-int    const DEFAULT_NUM_ITERATIONS = 10;
+size_t const DEFAULT_BYTES_PER_LINK = (1<<26);  // Amount of data transferred per Link
 
 int main(int argc, char **argv)
 {
@@ -41,126 +43,13 @@ int main(int argc, char **argv)
   }
 
   // If a negative value is listed for N, generate a comprehensive config file for this node
-  if (argc > 2 && atoi(argv[2]) < 0)
+  if (argc > 2 && atoll(argv[2]) < 0)
   {
     GenerateConfigFile(argv[1], -1*atoi(argv[2]));
     exit(0);
   }
 
-  // Collect environment variables / display current run configuration
-  bool useHipCall      = getenv("USE_HIP_CALL");       // Use hipMemcpy/hipMemset instead of custom shader kernels
-  bool useMemset       = getenv("USE_MEMSET");         // Perform a memset instead of a copy (ignores source memory)
-  bool useFineGrainMem = getenv("USE_FINEGRAIN_MEM");  // Allocate fine-grained GPU memory instead of coarse-grained GPU memory
-  bool useSingleSync   = getenv("USE_SINGLE_SYNC");    // Perform synchronization only once after all iterations instead of per iteration
-  bool useInteractive  = getenv("USE_INTERACTIVE");    // Pause for user-input before starting transfer loop
-  bool useSleep        = getenv("USE_SLEEP");          // Adds a 100ms sleep after each synchronization
-  bool reuseStreams    = getenv("REUSE_STREAMS");      // Re-use streams instead of creating / destroying per test
-  bool showAddr        = getenv("SHOW_ADDR");          // Print out memory addresses for each Link
-  bool outputToCsv     = getenv("OUTPUT_TO_CSV");      // Output in CSV format
-  int  byteOffset      = getenv("BYTE_OFFSET") ? atoi(getenv("BYTE_OFFSET")) : 0; // Byte-offset for memory allocations
-  int  numWarmups      = getenv("NUM_WARMUPS") ? atoi(getenv("NUM_WARMUPS")) : DEFAULT_NUM_WARMUPS;
-  int  numIterations   = getenv("NUM_ITERATIONS") ? atoi(getenv("NUM_ITERATIONS")) : DEFAULT_NUM_ITERATIONS;
-
-  // Determine number of bytes to run per link
-  // If a non-zero number of bytes is specified, use it
-  // Otherwise generate array of bytes values to execute over
-  std::vector<size_t> valuesOfN;
-  size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_LINK;
-
-  if (numBytesPerLink % 128)
-  {
-    printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 128\n", numBytesPerLink);
-    exit(1);
-  }
-
-  if (numBytesPerLink != 0)
-  {
-    size_t N = numBytesPerLink / sizeof(float);
-    valuesOfN.push_back(N);
-  }
-  else
-  {
-    for (int N = 256; N <= (1<<27); N *= 2)
-    {
-      int decimationFactor = 1;  // This can be modified to increase number of samples between powers of two
-      int delta = std::max(32, N / decimationFactor);
-      int curr = N;
-      while (curr < N * 2)
-      {
-        valuesOfN.push_back(curr);
-        curr += delta;
-      }
-    }
-  }
-
-  if (byteOffset % 4)
-  {
-    printf("[ERROR] byteOffset must be a multiple of 4\n");
-    exit(1);
-  }
-  int initOffset = byteOffset / sizeof(float);
-
-  char *env;
-  if (!outputToCsv)
-  {
-    printf("Run configuration\n");
-    printf("=====================================================\n");
-    printf("%-20s %8s: Using %s\n",
-           "USE_HIP_CALL", useHipCall ? "(set)" : "(unset)",
-           useHipCall ? "HIP functions" : "custom kernels");
-    printf("%-20s %8s: Performing %s\n",
-           "USE_MEMSET", useMemset ? "(set)" : "(unset)",
-           useMemset ? "memset" : "memcopy");
-    if (useHipCall && !useMemset)
-    {
-      env = getenv("HSA_ENABLE_SDMA");
-      printf("%-20s %8s: %s\n",
-             "HSA_ENABLE_SDMA", env ? env : "(unset)",
-             (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
-    }
-    printf("%-20s %8s: GPU destination memory type: %s-grained\n",
-           "USE_FINEGRAIN_MEM", useFineGrainMem ? "(set)" : "(unset)",
-           useFineGrainMem ? "fine" : "coarse");
-    printf("%-20s %8s: %s\n",
-           "USE_SINGLE_SYNC", useSingleSync ? "(set)" : "(unset)",
-           useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
-    printf("%-20s %8s: Running in %s mode\n",
-           "USE_INTERACTIVE", useInteractive ? "(set)" : "(unset)",
-           useInteractive ? "interactive" : "non-interactive");
-    printf("%-20s %8s: %s\n",
-           "USE_SLEEP", useSleep ? "(set)" : "(unset)",
-           useSleep ? "Add sleep after each sync" : "No sleep per sync");
-    printf("%-20s %8s: %s\n",
-           "REUSE_STREAMS", reuseStreams ? "(set)" : "(unset)",
-           reuseStreams ? "Re-using streams per topology" : "Creating/destroying streams per topology");
-    printf("%-20s %8s: %s\n",
-           "SHOW_ADDR", showAddr ? "(set)" : "(unset)",
-           showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
-    env = getenv("OUTPUT_TO_CSV");
-    printf("%-20s %8s: Output to csv\n",
-           "OUTPUT_TO_CSV", env ? env : "(unset)");
-    env = getenv("BYTE_OFFSET");
-    printf("%-20s %8s: Using byte offset of %d\n",
-           "BYTE_OFFSET", env ? env : "(unset)", byteOffset);
-    env = getenv("NUM_WARMUPS");
-    printf("%-20s %8s: Running %d warmup iteration(s) per topology\n",
-           "NUM_WARMUPS", env ? env : "(unset)", numWarmups);
-    env = getenv("NUM_ITERATIONS");
-    printf("%-20s %8s: Running %d timed iteration(s) per topology\n",
-           "NUM_ITERATIONS", env ? env : "(unset)", numIterations);
-    printf("\n");
-  }
-
-  // Collect the number of available CPUs/GPUs on this machine
-  int numGpuDevices;
-  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
-  if (numGpuDevices < 1)
-  {
-    printf("[ERROR] No GPU devices found\n");
-    exit(1);
-  }
-
-  // Read configuration file
+  // Check that Link configuration file can be opened
   FILE* fp = fopen(argv[1], "r");
   if (!fp)
   {
@@ -168,148 +57,177 @@ int main(int argc, char **argv)
     exit(1);
   }
 
-  // Track links that get used
-  std::map<std::pair<int, int>, int> linkMap;
-  std::vector<std::vector<hipStream_t>> streamCache(numGpuDevices);
-
-  // Print CSV header
-  if (outputToCsv)
+  // Check for NUMA library support
+  if (numa_available() == -1)
   {
-    printf("Test,NumBytes,ExeGpu,SrcMem,DstMem,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,numWarmups,numIters,useHipCall,useMemSet,useFineGrain,useSingleSync,resuseStreams\n");
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
   }
 
-  // Loop over each line in the configuration file
+  // Collect environment variables / display current run configuration
+  EnvVars ev;
+  ev.DisplayEnvVars();
+
+  // Determine number of bytes to run per Link
+  // If a non-zero number of bytes is specified, use it
+  // Otherwise generate array of bytes values to execute over
+  std::vector<size_t> valuesOfN;
+  size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_LINK;
+  PopulateTestSizes(numBytesPerLink, ev.samplingFactor, valuesOfN);
+
+  // Find the largest N to be used - memory will only be allocated once per link config
+  size_t maxN = valuesOfN[0];
+  for (auto N : valuesOfN)
+    maxN = std::max(maxN, N);
+
+  int const initOffset = ev.byteOffset / sizeof(float);
+  std::stack<std::thread> threads;
+
+  // Collect the number of available CPUs/GPUs on this machine
+  int numGpuDevices;
+  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
+  int const numCpuDevices = numa_num_configured_nodes();
+
+  // Track links that get used
+  std::set<std::pair<int, int>> peerAccessTracker;
+
+  // Print CSV header
+  if (ev.outputToCsv)
+  {
+    printf("Test,NumBytes,Executor,SrcMem,DstMem,CUs,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters,useHipCall,useMemSet,useSingleSync,combinedTiming\n");
+  }
+
+  // Loop over each line in the Link configuration file
   int testNum = 0;
   char line[2048];
   while(fgets(line, 2048, fp))
   {
     // Parse links from configuration file
     std::vector<Link> links;
-    ParseLinks(line, links);
+    ParseLinks(line, numCpuDevices, numGpuDevices, links);
 
     int const numLinks = links.size();
     if (numLinks == 0) continue;
     testNum++;
 
+    // Prepare link
+    for (int i = 0; i < numLinks; i++)
+    {
+      // Get some aliases to link variables
+      MemType const& exeMemType  = links[i].exeMemType;
+      int     const& exeIndex    = links[i].exeIndex;
+      MemType const& srcMemType  = links[i].srcMemType;
+      MemType const& dstMemType  = links[i].dstMemType;
+      int     const& srcIndex    = links[i].srcIndex;
+      int     const& dstIndex    = links[i].dstIndex;
+      int     const& blocksToUse = links[i].numBlocksToUse;
+
+      // Enable peer-to-peer access if necessary (can only be called once per unique pair)
+      if (exeMemType == MEM_GPU)
+      {
+        // Ensure executing GPU can access source memory
+        if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
+        {
+          auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
+           if (!peerAccessTracker.count(exeSrcPair))
+          {
+            EnablePeerAccess(exeIndex, srcIndex);
+            peerAccessTracker.insert(exeSrcPair);
+          }
+        }
+
+        // Ensure executing GPU can access destination memory
+        if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex)
+        {
+          auto exeDstPair = std::make_pair(exeIndex, dstIndex);
+          if (!peerAccessTracker.count(exeDstPair))
+          {
+            EnablePeerAccess(exeIndex, dstIndex);
+            peerAccessTracker.insert(exeDstPair);
+          }
+        }
+      }
+
+      // Allocate (maximum) source / destination memory based on type / device index
+      AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].srcMem);
+      AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].dstMem);
+
+      // Prepare execution agent
+      if (exeMemType == MEM_GPU)
+      {
+        HIP_CALL(hipSetDevice(exeIndex));
+        HIP_CALL(hipEventCreate(&links[i].startEvent));
+        HIP_CALL(hipEventCreate(&links[i].stopEvent));
+        HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * blocksToUse));
+        HIP_CALL(hipStreamCreate(&links[i].stream));
+      }
+      else if (exeMemType == MEM_CPU)
+      {
+        links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam));
+      }
+    }
+
     // Loop over all the different number of bytes to use per Link
     for (auto N : valuesOfN)
     {
-      if (!outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
-      float*                  linkSrcMem[numLinks];        // Source memory per Link
-      float*                  linkDstMem[numLinks];        // Destination memory per Link
-      hipStream_t             streams[numLinks];           // hipStream to use per Link
-      hipEvent_t              startEvents[numLinks];       // Start event per Link
-      hipEvent_t              stopEvents[numLinks];        // Stop event per Link
-      hipEvent_t              dummyEvents[numLinks];       // Dummy event per Link
-      std::vector<BlockParam> cpuBlockParams[numLinks];    // CPU copy of block parameters
-      BlockParam*             gpuBlockParams[numLinks];    // GPU copy of block parameters
-
-      // Clear counters
-      int linkCount[numGpuDevices];
-      for (int i = 0; i < numGpuDevices; i++)
-        linkCount[i] = 0;
+      if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
 
+      // Prepare links based on current N
       for (int i = 0; i < numLinks; i++)
       {
-        MemType srcMemType  = links[i].srcMemType;
-        MemType dstMemType  = links[i].dstMemType;
-        int     exeIndex    = links[i].exeIndex;
-        int     srcIndex    = links[i].srcIndex;
-        int     dstIndex    = links[i].dstIndex;
-        int     blocksToUse = links[i].numBlocksToUse;
-
-        // Check for valid src/dst indices
-        if ((srcIndex < 0 || srcIndex >= numGpuDevices) ||
-            (dstIndex < 0 || dstIndex >= numGpuDevices) ||
-            (exeIndex < 0 || exeIndex >= numGpuDevices))
-        {
-          printf("[ERROR] Invalid link %d:(%c%d->%c%d) GPU index must be between 0 and %d inclusively\n",
-                 exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, numGpuDevices-1);
-          exit(1);
-        }
-
-        // Enable peer-to-peer access if this is the first time seeing this pair
-        if (srcMemType == MEM_GPU && dstMemType == MEM_GPU)
-        {
-          auto linkPair = std::make_pair(srcIndex, dstIndex);
-          linkMap[linkPair]++;
-          if (linkMap[linkPair] == 1 && srcIndex != dstIndex)
-          {
-            int canAccess;
-            HIP_CALL(hipDeviceCanAccessPeer(&canAccess, srcIndex, dstIndex));
-            if (!canAccess)
-            {
-              printf("[ERROR] Unable to enable peer access between GPU devices %d and %d\n", srcIndex, dstIndex);
-              exit(1);
-            }
-            HIP_CALL(hipSetDevice(srcIndex));
-            HIP_CALL(hipDeviceEnablePeerAccess(dstIndex, 0));
-          }
-        }
-
-        // Allocate hipEvents / hipStreams on executing GPU
-        HIP_CALL(hipSetDevice(exeIndex));
-        HIP_CALL(hipEventCreate(&startEvents[i]));
-        HIP_CALL(hipEventCreate(&stopEvents[i]));
-        HIP_CALL(hipEventCreate(&dummyEvents[i]));
-        HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks));
-        if (reuseStreams)
-        {
-          // If re-using streams, create new stream, otherwise point to existing stream
-          if (streamCache[exeIndex].size() <= linkCount[exeIndex])
-          {
-            streamCache[exeIndex].resize(linkCount[exeIndex] + 1);
-            HIP_CALL(hipStreamCreate(&streamCache[exeIndex][linkCount[exeIndex]]));
-          }
-          streams[i] = streamCache[exeIndex][linkCount[exeIndex]];
-        }
-        else
-        {
-          HIP_CALL(hipStreamCreate(&streams[i]));
-        }
-
-        // Allocate source / destination memory based on type / device index
-        AllocateMemory(srcMemType, srcIndex, N * sizeof(float) + byteOffset, useFineGrainMem, &linkSrcMem[i]);
-        AllocateMemory(dstMemType, dstIndex, N * sizeof(float) + byteOffset, useFineGrainMem, &linkDstMem[i]);
-
         // Initialize source memory with patterned data
-        CheckOrFill(MODE_FILL, N, useMemset, useHipCall, linkSrcMem[i] + initOffset);
-
-        // Count # of links / total blocks each GPU will be working on
-        linkCount[exeIndex]++;
+        CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, links[i].srcMem + initOffset);
 
         // Each block needs to know src/dst pointers and how many elements to transfer
-        // Figure out the sub-array each block does for this link
-        // NOTE: Have each sub-array to work on multiple of 32-floats (128-bytes),
-        //       but divide as evenly as possible
-        // NOTE: N is always a multiple of 32
-        int blocksWithExtra = (N / 32) % links[i].numBlocksToUse;
-        int perBlockBaseN   = (N / 32) / links[i].numBlocksToUse * 32;
-        for (int j = 0; j < links[i].numBlocksToUse; j++)
+        // Figure out the sub-array each block does for this Link
+        // - Partition N as evenly as posible, but try to keep blocks as multiples of 32,
+        //   except the very last one, for alignment reasons
+        if (links[i].exeMemType == MEM_GPU)
         {
-          BlockParam param;
-          param.N   = perBlockBaseN + ((j < blocksWithExtra) ? 32 : 0);
-          param.src = linkSrcMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
-                                                              j : blocksWithExtra) * 32) + initOffset;
-          param.dst = linkDstMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
-                                                              j : blocksWithExtra) * 32) + initOffset;
-          cpuBlockParams[i].push_back(param);
+          size_t assigned = 0;
+          int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse);
+          for (int j = 0; j < links[i].numBlocksToUse; j++)
+          {
+            BlockParam param;
+            int blocksLeft = std::max(0, maxNumBlocksToUse - j);
+            size_t leftover = N - assigned;
+            size_t roundedN = (leftover + 31) / 32;
+            param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
+            param.src = links[i].srcMem + assigned + initOffset;
+            param.dst = links[i].dstMem + assigned + initOffset;
+            assigned += param.N;
+
+            HIP_CALL(hipMemcpy(&links[i].blockParam[j], &param, sizeof(BlockParam), hipMemcpyHostToDevice));
+          }
+        }
+        else if (links[i].exeMemType == MEM_CPU)
+        {
+          // For CPU-based copy, divded based on the number of child threads
+          size_t assigned = 0;
+          int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)ev.numCpuPerLink);
+          for (int j = 0; j < ev.numCpuPerLink; j++)
+          {
+            int blocksLeft = std::max(0, maxNumBlocksToUse - j);
+            size_t leftover = N - assigned;
+            size_t roundedN = (leftover + 31) / 32;
+            links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
+            links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset;
+            links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset;
+            assigned += links[i].blockParam[j].N;
+          }
         }
 
-        HIP_CALL(hipMemcpy(gpuBlockParams[i], cpuBlockParams[i].data(),
-                           sizeof(BlockParam) * links[i].numBlocksToUse, hipMemcpyHostToDevice));
+        // Initialize timing
+        links[i].totalTime = 0.0;
       }
 
-      // Launch kernels (warmup iterations are not counted)
       double totalCpuTime = 0;
-      double totalGpuTime[numLinks];
 
-      for (int i = 0; i < numLinks; i++) totalGpuTime[i] = 0.0;
-
-      for (int iteration = -numWarmups; iteration < numIterations; iteration++)
+      // Launch kernels (warmup iterations are not counted)
+      for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++)
       {
         // Pause before starting first timed iteration in interactive mode
-        if (useInteractive && iteration == 0)
+        if (ev.useInteractive && iteration == 0)
         {
           printf("Hit <Enter> to continue: ");
           scanf("%*c");
@@ -319,79 +237,26 @@ int main(int argc, char **argv)
         // Start CPU timing for this iteration
         auto cpuStart = std::chrono::high_resolution_clock::now();
 
-        // Enqueue all links
+        // Execute all links in parallel
+        for (int i = 0; i < numLinks; i++)
+          threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i])));
+
+        // Wait for all threads to finish
         for (int i = 0; i < numLinks; i++)
         {
-          HIP_CALL(hipSetDevice(links[i].exeIndex));
-
-          bool recordStart = (!useSingleSync || iteration == 0);
-          bool recordStop  = (!useSingleSync || iteration == numIterations - 1);
-
-          if (useHipCall)
-          {
-            // Record start event
-            if (recordStart) HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
-
-            // Execute hipMemset / hipMemcpy
-            if (useMemset)
-              HIP_CALL(hipMemsetAsync(linkDstMem[i] + initOffset, 42, N * sizeof(float), streams[i]));
-            else
-              HIP_CALL(hipMemcpyAsync(linkDstMem[i] + initOffset,
-                                      linkSrcMem[i] + initOffset,
-                                      N * sizeof(float), hipMemcpyDeviceToDevice,
-                                      streams[i]));
-            // Record stop event
-            if (recordStop) HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
-          }
-          else
-          {
-            hipExtLaunchKernelGGL(useMemset ? MemsetKernel : CopyKernel,
-                                  dim3(links[i].numBlocksToUse, 1, 1),
-                                  dim3(BLOCKSIZE, 1, 1),
-                                  0, streams[i],
-                                  recordStart ? startEvents[i] : NULL,
-                                  recordStop  ?  stopEvents[i] : NULL,
-                                  0, gpuBlockParams[i]);
-          }
-        }
-
-        // Synchronize per iteration, unless in single sync mode, in which case
-        // synchronize during last warmup / last actual iteration
-        if (!useSingleSync || iteration == -1 || iteration == numIterations - 1)
-        {
-          for (int i = 0; i < numLinks; i++)
-          {
-            HIP_CALL(hipSetDevice(links[i].exeIndex));
-            hipStreamSynchronize(streams[i]);
-          }
+          threads.top().join();
+          threads.pop();
         }
 
         // Stop CPU timing for this iteration
         auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
         double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
-        if (useSleep) usleep(100000);
 
-        if (iteration >= 0)
-        {
-          totalCpuTime += deltaSec;
-
-          // Record GPU timing
-          if (!useSingleSync || iteration == numIterations - 1)
-          {
-            for (int i = 0; i < numLinks; i++)
-            {
-              HIP_CALL(hipSetDevice(links[i].exeIndex));
-              HIP_CALL(hipEventSynchronize(startEvents[i]));
-              HIP_CALL(hipEventSynchronize(stopEvents[i]));
-              float gpuDeltaMsec;
-              HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[i], stopEvents[i]));
-              totalGpuTime[i] += gpuDeltaMsec;
-            }
-          }
-        }
+        if (iteration >= 0) totalCpuTime += deltaSec;
       }
 
-      if (useInteractive)
+      // Pause for interactive mode
+      if (ev.useInteractive)
       {
         printf("Transfers complete. Hit <Enter> to continue: ");
         scanf("%*c");
@@ -400,150 +265,152 @@ int main(int argc, char **argv)
 
       // Validate that each link has transferred correctly
       for (int i = 0; i < numLinks; i++)
-        CheckOrFill(MODE_CHECK, N, useMemset, useHipCall, linkDstMem[i] + initOffset);
+        CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, links[i].dstMem + initOffset);
 
       // Report timings
-      totalCpuTime = totalCpuTime / (1.0 * numIterations) * 1000;
-      double totalBandwidthGbs = 0.0;
+      totalCpuTime = totalCpuTime / (1.0 * ev.numIterations) * 1000;
+      double totalBandwidthGbs = (numLinks * N * sizeof(float) / 1.0E6) / totalCpuTime;
       for (int i = 0; i < numLinks; i++)
       {
-        double linkDurationMsec = totalGpuTime[i] / (1.0 * numIterations);
+        double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations);
         double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
-        totalBandwidthGbs += linkBandwidthGbs;
-        if (!outputToCsv)
+        if (!ev.outputToCsv)
         {
-          printf(" Link %02d: %c%02d -> [GPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %9s |",
+          printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s",
                  i + 1,
                  MemTypeStr[links[i].srcMemType], links[i].srcIndex,
-                 links[i].exeIndex, links[i].numBlocksToUse,
+                 MemTypeStr[links[i].exeMemType], links[i].exeIndex,
+                 links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse,
                  MemTypeStr[links[i].dstMemType], links[i].dstIndex,
                  linkBandwidthGbs, linkDurationMsec,
                  GetLinkDesc(links[i]).c_str());
-          if (showAddr) printf(" %16p | %16p |", linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset);
+          if (ev.showAddr) printf(" %16p | %16p |", links[i].srcMem + initOffset, links[i].dstMem + initOffset);
           printf("\n");
         }
         else
         {
-          printf("%d,%lu,%02d,%c%02d,%c%02d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%s,%s,%s,%s,%s\n",
-                 testNum, N * sizeof(float), links[i].exeIndex,
+          printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%d,%s,%s,%s,%s\n",
+                 testNum, N * sizeof(float),
                  MemTypeStr[links[i].srcMemType], links[i].srcIndex,
+                 MemTypeStr[links[i].exeMemType], links[i].exeIndex,
                  MemTypeStr[links[i].dstMemType], links[i].dstIndex,
+                 links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse,
                  linkBandwidthGbs, linkDurationMsec,
                  GetLinkDesc(links[i]).c_str(),
-                 linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset,
-                 numWarmups, numIterations,
-                 useHipCall ? "true" : "false",
-                 useMemset ? "true" : "false",
-                 useFineGrainMem ? "true" : "false",
-                 useSingleSync ? "true" : "false",
-                 reuseStreams ? "true" : "false");
+                 links[i].srcMem + initOffset, links[i].dstMem + initOffset,
+                 ev.byteOffset,
+                 ev.numWarmups, ev.numIterations,
+                 ev.useHipCall ? "true" : "false",
+                 ev.useMemset ? "true" : "false",
+                 ev.useSingleSync ? "true" : "false",
+                 ev.combineTiming ? "true" : "false");
         }
       }
 
       // Display aggregate statistics
-      if (!outputToCsv)
+      if (!ev.outputToCsv)
       {
-        printf(" Aggregate Bandwidth                | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime);
+        printf(" Aggregate Bandwidth (CPU timed)    | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime);
       }
       else
       {
-        printf("%d,%lu,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%s,%s,%s,%s,%s\n",
-               testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, numWarmups, numIterations,
-               useHipCall ? "true" : "false",
-               useMemset ? "true" : "false",
-               useFineGrainMem ? "true" : "false",
-               useSingleSync ? "true" : "false",
-               reuseStreams ? "true" : "false");
+        printf("%d,%lu,ALL,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%d,%s,%s,%s,%s\n",
+               testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset,
+               ev.numWarmups, ev.numIterations,
+               ev.useHipCall ? "true" : "false",
+               ev.useMemset ? "true" : "false",
+               ev.useSingleSync ? "true" : "false",
+               ev.combineTiming ? "true" : "false");
       }
+    }
 
-      // Release GPU memory
-      for (int i = 0; i < numLinks; i++)
+    // Release GPU memory
+    for (int i = 0; i < numLinks; i++)
+    {
+      DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
+      DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
+
+      if (links[i].exeMemType == MEM_GPU)
       {
-        DeallocateMemory(links[i].srcMemType, links[i].srcIndex, linkSrcMem[i]);
-        DeallocateMemory(links[i].dstMemType, links[i].dstIndex, linkDstMem[i]);
-        HIP_CALL(hipFree(gpuBlockParams[i]));
-        if (!reuseStreams)
-          HIP_CALL(hipStreamDestroy(streams[i]));
-        HIP_CALL(hipEventDestroy(startEvents[i]));
-        HIP_CALL(hipEventDestroy(stopEvents[i]));
+        HIP_CALL(hipEventDestroy(links[i].startEvent));
+        HIP_CALL(hipEventDestroy(links[i].stopEvent));
+        HIP_CALL(hipStreamDestroy(links[i].stream));
+        HIP_CALL(hipFree(links[i].blockParam));
+      }
+      else if (links[i].exeMemType == MEM_CPU)
+      {
+        free(links[i].blockParam);
       }
     }
   }
   fclose(fp);
 
-  // Clean up stream cache if re-using streams
-  if (reuseStreams)
-  {
-    for (auto streamVector : streamCache)
-      for (auto stream : streamVector)
-        HIP_CALL(hipStreamDestroy(stream));
-  }
-
   return 0;
 }
 
 void DisplayUsage(char const* cmdName)
 {
+  if (numa_available() == -1)
+  {
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
+  }
+  int numGpuDevices;
+  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
+  int const numCpuDevices = numa_num_configured_nodes();
+
   printf("Usage: %s configFile <N>\n", cmdName);
 
   printf("  configFile: File containing Links to execute (see below for format)\n");
   printf("  N         : (Optional) Number of bytes to transfer per link.\n");
-  printf("              If not specified, defaults to %lu bytes. Must be a multiple of 128 bytes\n", DEFAULT_BYTES_PER_LINK);
+  printf("              If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK);
   printf("              If 0 is specified, a range of Ns will be benchmarked\n");
   printf("              If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n");
   printf("\n");
   printf("Configfile Format:\n");
   printf("==================\n");
-  printf("A Link is defined as a uni-directional transfer from src memory location to dst memory location\n");
+  printf("A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU\n");
   printf("Each single line in the configuration file defines a set of Links to run in parallel\n");
   printf("\n");
   printf("There are two ways to specify the configuration file:\n");
   printf("\n");
   printf("1) Basic\n");
-  printf("   The basic specification assumes the same number of threadblocks/CUs used per link\n");
+  printf("   The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link\n");
   printf("   A positive number of Links is specified followed by that number of triplets describing each Link\n");
   printf("\n");
-  printf("   #Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)\n");
+  printf("   #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)\n");
   printf("\n");
   printf("2) Advanced\n");
-  printf("   The advanced specification allows different number of threadblocks/CUs used per Link\n");
+  printf("   The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link\n");
   printf("   A negative number of links is specified, followed by quadruples describing each Link\n");
-  printf("   -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)\n");
+  printf("   -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)\n");
   printf("\n");
   printf("Argument Details:\n");
   printf("  #Links  :   Number of Links to be run in parallel\n");
-  printf("  #CUs    :   Number of threadblocks/CUs to use for a Link\n");
-  printf("  GpuIndex:   0-indexed GPU id executing the Link\n");
+  printf("  #CUs    :   Number of threadblocks/CUs to use for a GPU-executed Link\n");
   printf("  srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode\n");
+  printf("  Executor:   Executor are specified by a character indicating executor type, followed by device index (0-indexed)\n");
+  printf("              - C: CPU-executed  (Indexed from 0 to %d)\n", numCpuDevices-1);
+  printf("              - G: GPU-executed  (Indexed from 0 to %d)\n", numGpuDevices-1);
   printf("  dstMemL :   Destination memory location (Where the data is to be written to)\n");
-  printf("              Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)\n");
+  printf("\n");
+  printf("              Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)\n");
   printf("              Supported memory locations are:\n");
-  printf("              - P:    Pinned host memory   (on CPU, on NUMA node closest to provided GPU index)\n");
-  printf("              - G:    Global device memory (on GPU)\n");
-  printf("Round brackets may be included for human clarity, but will be ignored\n");
+  printf("              - C:    Pinned host memory       (on NUMA node, indexed from 0 to %d)\n", numCpuDevices-1);
+  printf("              - G:    Global device memory     (on GPU device indexed from 0 to %d)\n", numGpuDevices-1);
+  printf("              - F:    Fine-grain device memory (on GPU device indexed from 0 to %d)\n", numGpuDevices-1);
   printf("\n");
   printf("Examples:\n");
-  printf("1 4 (0 G0 G1)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1\n");
-  printf("1 4 (0 G1 G0)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0\n");
-  printf("1 4 (2 P0 G2)              Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2\n");
-  printf("2 4 (0 G0 G1) (1 G1 G0)    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs\n");
-  printf("-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs\n");
+  printf("1 4 (G0->G0->G1)             Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1\n");
+  printf("1 4 (G1->C0->G0)             Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0\n");
+  printf("1 4 (C0->G2->G2)             Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2\n");
+  printf("2 4 G0->G0->G1 G1->G1->G0    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs\n");
+  printf("-2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs\n");
   printf("\n");
+  printf("Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary\n");
   printf("\n");
-  printf("Environment variables:\n");
-  printf("======================\n");
-  printf(" USE_HIP_CALL       - Use hipMemcpy/hipMemset instead of custom shader kernels\n");
-  printf(" USE_MEMSET         - Perform a memset instead of a copy (ignores source memory)\n");
-  printf(" USE_FINEGRAIN_MEM  - Allocate fine-grained GPU memory instead of coarse-grained GPU memory\n");
-  printf(" USE_SINGLE_SYNC    - Perform synchronization only once after all iterations instead of per iteration\n");
-  printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
-  printf(" USE_SLEEP          - Adds a 100ms sleep after each synchronization\n");
-  printf(" REUSE_STREAMS      - Re-use streams instead of creating / destroying per test\n");
-  printf(" SHOW_ADDR          - Print out memory addresses for each Link\n");
-  printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
-  printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
-  printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
-  printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
+
+  EnvVars::DisplayUsage();
 }
 
 void GenerateConfigFile(char const* cfgFile, int numBlocks)
@@ -551,13 +418,28 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   // Detect number of available GPUs and skip if less than 2
   int numGpuDevices;
   HIP_CALL(hipGetDeviceCount(&numGpuDevices));
-  printf("Generated configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks);
+  printf("Generating configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks);
   if (numGpuDevices < 2)
   {
     printf("Skipping. (Less than 2 GPUs detected)\n");
     exit(0);
   }
 
+  // Check first to see if file exists, and issue warning
+  FILE* exists = fopen(cfgFile, "r");
+  if (exists)
+  {
+    fclose(exists);
+    printf("[WARN] File %s alreadys exists.  Enter 'Y' to confirm overwrite\n", cfgFile);
+    char ch;
+    scanf(" %c", &ch);
+    if (ch != 'Y' && ch != 'y')
+    {
+      printf("Aborting\n");
+      exit(0);
+    }
+  }
+
   // Open config file for writing
   FILE* fp = fopen(cfgFile, "w");
   if (!fp)
@@ -569,18 +451,18 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   // CU testing
   fprintf(fp, "# CU scaling tests\n");
   for (int i = 1; i < 16; i++)
-    fprintf(fp, "1 %d (0 G0 G1)\n", i);
+    fprintf(fp, "1 %d (G0->G0->G1)\n", i);
   fprintf(fp, "\n");
 
   // Pinned memory testing
   fprintf(fp, "# Pinned CPU memory read tests\n");
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, "1 %d (%d C%d G%d)\n", numBlocks, i, i, i);
+    fprintf(fp, "1 %d (C0->G%d->G%d)\n", numBlocks, i, i);
   fprintf(fp, "\n");
 
   fprintf(fp, "# Pinned CPU memory write tests\n");
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, "1 %d (%d G%d C%d)\n", numBlocks, i, i, i);
+    fprintf(fp, "1 %d (G%d->G%d->C0)\n", numBlocks, i, i);
   fprintf(fp, "\n");
 
   // Single link testing GPU testing
@@ -589,7 +471,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
     for (int j = 0; j < numGpuDevices; j++)
     {
       if (i == j) continue;
-      fprintf(fp, "1 %d (%d G%d G%d)\n", numBlocks, i, i, j);
+      fprintf(fp, "1 %d (G%d->G%d->G%d)\n", numBlocks, i, i, j);
     }
   fprintf(fp, "\n");
 
@@ -599,7 +481,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
     for (int j = 0; j < numGpuDevices; j++)
     {
       if (i == j) continue;
-      fprintf(fp, "2 %d (%d G%d G%d) (%d G%d G%d)\n", numBlocks, i, i, j, j, j, i);
+      fprintf(fp, "2 %d (G%d->G%d->G%d) (G%d->G%d->G%d)\n", numBlocks, i, i, j, j, j, i);
     }
   fprintf(fp, "\n");
 
@@ -608,7 +490,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   fprintf(fp, "%d %d", numGpuDevices, numBlocks);
   for (int i = 0; i < numGpuDevices; i++)
   {
-    fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
+    fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices);
   }
   fprintf(fp, "\n\n");
 
@@ -616,23 +498,23 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   fprintf(fp, "# Simple bi-directional ring\n");
   fprintf(fp, "%d %d", numGpuDevices * 2, numBlocks);
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
+    fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices);
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices);
+    fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices);
   fprintf(fp, "\n\n");
 
   // Broadcast from GPU 0
   fprintf(fp, "# GPU 0 Broadcast\n");
   fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
   for (int i = 1; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", 0, 0, i);
+    fprintf(fp, " (G%d->G%d->G%d)", 0, 0, i);
   fprintf(fp, "\n\n");
 
   // Gather to GPU 0
   fprintf(fp, "# GPU 0 Gather\n");
   fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
   for (int i = 1; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", 0, i, 0);
+    fprintf(fp, " (G%d->G%d->G%d)", 0, i, 0);
   fprintf(fp, "\n\n");
 
   // Full stress test
@@ -642,7 +524,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
     for (int j = 0; j < numGpuDevices; j++)
     {
       if (i == j) continue;
-      fprintf(fp, " (%d G%d G%d)", i, i, j);
+      fprintf(fp, " (G%d->G%d->G%d)", i, i, j);
     }
   fprintf(fp, "\n\n");
 
@@ -687,29 +569,87 @@ void DisplayTopology()
   }
 }
 
-void ParseMemType(std::string const& token, MemType* memType, int* memIndex)
+void PopulateTestSizes(size_t const numBytesPerLink,
+                       int const samplingFactor,
+                       std::vector<size_t>& valuesOfN)
+{
+  valuesOfN.clear();
+
+  // If the number of bytes is specified, use it
+  if (numBytesPerLink != 0)
+  {
+    if (numBytesPerLink % 4)
+    {
+      printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 4\n", numBytesPerLink);
+      exit(1);
+    }
+    size_t N = numBytesPerLink / sizeof(float);
+    valuesOfN.push_back(N);
+  }
+  else
+  {
+    // Otherwise generate a range of values
+    // (Powers of 2, with samplingFactor samples between successive powers of 2)
+    for (int N = 256; N <= (1<<27); N *= 2)
+    {
+      int delta = std::max(32, N / samplingFactor);
+      int curr = N;
+      while (curr < N * 2)
+      {
+        valuesOfN.push_back(curr);
+        curr += delta;
+      }
+    }
+  }
+}
+
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex)
 {
   char typeChar;
   if (sscanf(token.c_str(), " %c %d", &typeChar, memIndex) != 2)
   {
-    printf("Error parsing memory type token %s\n", token.c_str());
+    printf("[ERROR] Unable to parse memory type token %s - expecting either 'C' or 'G' or 'F' followed by an index\n", token.c_str());
     exit(1);
   }
 
   switch (typeChar)
   {
-  case 'C': case 'c': *memType = MEM_CPU; break;
-  case 'G': case 'g': *memType = MEM_GPU; break;
-  default: printf("Unrecognized memory type %s\n", token.c_str()); exit(1);
+  case 'C': case 'c':
+    *memType = MEM_CPU;
+    if (*memIndex < 0 || *memIndex >= numCpus)
+    {
+      printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, *memIndex);
+      exit(1);
+    }
+    break;
+  case 'G': case 'g':
+    *memType = MEM_GPU;
+    if (*memIndex < 0 || *memIndex >= numGpus)
+    {
+      printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex);
+      exit(1);
+    }
+    break;
+  case 'F': case 'f':
+    *memType = MEM_GPU_FINE;
+    if (*memIndex < 0 || *memIndex >= numGpus)
+    {
+      printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex);
+      exit(1);
+    }
+    break;
+  default:
+    printf("[ERROR] Unrecognized memory type %s.  Expecting either 'C' or 'G' or 'F'\n", token.c_str());
+    exit(1);
   }
 }
 
-// Helper function to parse a link of link definitions
-void ParseLinks(char* line, std::vector<Link>& links)
+// Helper function to parse a list of link definitions
+void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links)
 {
-  // Replace any round brackets with spaces
+  // Replace any round brackets or '->' with spaces,
   for (int i = 0; line[i]; i++)
-    if (line[i] == '(' || line[i] == ')') line[i] = ' ';
+    if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
 
   links.clear();
   int numLinks = 0;
@@ -720,11 +660,12 @@ void ParseLinks(char* line, std::vector<Link>& links)
   iss >> numLinks;
   if (iss.fail()) return;
 
+  std::string exeMem;
   std::string srcMem;
   std::string dstMem;
   if (numLinks > 0)
   {
-    // Method 1: Take in triples (exeGpu, srcMem, dstMem)
+    // Method 1: Take in triples (srcMem, exeMem, dstMem)
     int numBlocksToUse;
     iss >> numBlocksToUse;
     if (numBlocksToUse <= 0)
@@ -735,47 +676,100 @@ void ParseLinks(char* line, std::vector<Link>& links)
     links.resize(numLinks);
     for (int i = 0; i < numLinks; i++)
     {
-      iss >> links[i].exeIndex >> srcMem >> dstMem;
-      ParseMemType(srcMem, &links[i].srcMemType, &links[i].srcIndex);
-      ParseMemType(dstMem, &links[i].dstMemType, &links[i].dstIndex);
+      iss >> srcMem >> exeMem >> dstMem;
+      ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex);
+      ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex);
+      ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex);
       links[i].numBlocksToUse = numBlocksToUse;
+      if (links[i].exeMemType != MEM_CPU && links[i].exeMemType != MEM_GPU)
+      {
+        printf("[ERROR] Executor must either be CPU ('C') or GPU ('G')\n");
+        exit(1);
+      }
     }
   }
   else
   {
-    // Method 2: Read in quads (exeGpu, srcMem, dstMem,  Read common # blocks to use, then read (src, dst) doubles
+    // Method 2: Read in quads (srcMem, exeMem, dstMem,  Read common # blocks to use, then read (src, dst) doubles
     numLinks *= -1;
     links.resize(numLinks);
 
     for (int i = 0; i < numLinks; i++)
     {
-      iss >> links[i].exeIndex >> srcMem >> dstMem >> links[i].numBlocksToUse;
-      ParseMemType(srcMem, &links[i].srcMemType, &links[i].srcIndex);
-      ParseMemType(dstMem, &links[i].dstMemType, &links[i].dstIndex);
+      iss >> srcMem >> exeMem >> dstMem >> links[i].numBlocksToUse;
+      ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex);
+      ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex);
+      ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex);
+      if (links[i].exeMemType != MEM_CPU || links[i].exeMemType != MEM_GPU)
+      {
+        printf("[ERROR] Executor must either be CPU ('C') or GPU ('G')\n");
+        exit(1);
+      }
+
     }
   }
 }
 
-void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr)
+void EnablePeerAccess(int const deviceId, int const peerDeviceId)
 {
-  HIP_CALL(hipSetDevice(devIndex));
+  int canAccess;
+  HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
+  if (!canAccess)
+  {
+    printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId);
+    exit(1);
+  }
+  HIP_CALL(hipSetDevice(deviceId));
+  HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0));
+}
+
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr)
+{
+  if (numBytes == 0)
+  {
+    printf("[ERROR] Unable to allocate 0 bytes\n");
+    exit(1);
+  }
 
   if (memType == MEM_CPU)
   {
-    // // Allocate pinned-memory on NUMA node closest to the selected GPU
-    HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocPortable));
+    // Set numa policy prior to call to hipHostMalloc
+    unsigned long nodemask = (1ULL << devIndex);
+    long retCode = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
+    if (retCode)
+    {
+      printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", devIndex);
+      exit(1);
+    }
+
+    // Allocate host-pinned memory (should respect NUMA mem policy)
+    HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
+
+    // Check that the allocated pages are actually on the correct NUMA node
+    CheckPages((char*)*memPtr, numBytes, devIndex);
+
+    // Reset to default numa mem policy
+    retCode = set_mempolicy(MPOL_DEFAULT, NULL, 8);
+    if (retCode)
+    {
+      printf("[ERROR] Unable reset to default NUMA memory policy\n");
+      exit(1);
+    }
   }
   else if (memType == MEM_GPU)
   {
-    // Allocate GPU memory
-    if (useFineGrainMem)
-      HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
-    else
-      HIP_CALL(hipMalloc((void**)memPtr, numBytes));
+    // Allocate GPU memory on appropriate device
+    HIP_CALL(hipSetDevice(devIndex));
+    HIP_CALL(hipMalloc((void**)memPtr, numBytes));
+  }
+  else if (memType == MEM_GPU_FINE)
+  {
+    HIP_CALL(hipSetDevice(devIndex));
+    HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
   }
   else
   {
-    printf("Error: Unsupported memory type %d\n", memType);
+    printf("[ERROR] Unsupported memory type %d\n", memType);
     exit(1);
   }
 }
@@ -786,12 +780,51 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr)
   {
     HIP_CALL(hipHostFree(memPtr));
   }
-  else if (memType == MEM_GPU)
+  else if (memType == MEM_GPU || memType == MEM_GPU_FINE)
   {
     HIP_CALL(hipFree(memPtr));
   }
 }
 
+void CheckPages(char* array, size_t numBytes, int targetId)
+{
+  unsigned long const pageSize = getpagesize();
+  unsigned long const numPages = (numBytes + pageSize - 1) / pageSize;
+
+  std::vector<void *> pages(numPages);
+  std::vector<int> status(numPages);
+
+  pages[0] = array;
+  for (int i = 1; i < numPages; i++)
+  {
+    pages[i] = (char*)pages[i-1] + pageSize;
+  }
+
+  long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
+  if (retCode)
+  {
+    printf("[ERROR] Unable to collect page info\n");
+    exit(1);
+  }
+
+  size_t mistakeCount = 0;
+  for (int i = 0; i < numPages; i++)
+  {
+    if (status[i] < 0)
+    {
+      printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i);
+      exit(1);
+    }
+    if (status[i] != targetId) mistakeCount++;
+  }
+  if (mistakeCount > 0)
+  {
+    printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
+    // NOTE: Some older versions of HIP do not properly respect NUMA policy so avoid failing for now
+    // exit(1);
+  }
+}
+
 // Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr)
 {
@@ -853,31 +886,132 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
   return result;
 }
 
+std::string GetDesc(MemType srcMemType, int srcIndex,
+                    MemType dstMemType, int dstIndex)
+{
+  if (srcMemType == MEM_CPU)
+  {
+    if (dstMemType == MEM_CPU)
+      return (srcIndex == dstIndex) ? "LOCAL" : "NUMA";
+    else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
+      return "PCIE";
+    else
+      goto error;
+  }
+  else if (srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE)
+  {
+    if (dstMemType == MEM_CPU)
+      return "PCIE";
+    else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
+    {
+      if (srcIndex == dstIndex) return "LOCAL";
+      else
+      {
+        uint32_t linkType, hopCount;
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(srcIndex, dstIndex, &linkType, &hopCount));
+        return GetLinkTypeDesc(linkType, hopCount);
+      }
+    }
+    else
+      goto error;
+  }
+error:
+  printf("[ERROR] Unrecognized memory type\n");
+  exit(1);
+}
+
 std::string GetLinkDesc(Link const& link)
 {
-  std::string result = "";
-
-  // Currently only describe links between src/dst on GPU
-  if (link.srcMemType == MEM_GPU && link.dstMemType == MEM_GPU)
-  {
-    if (link.exeIndex != link.srcIndex)
-    {
-      uint32_t linkType, hopCount;
-      HIP_CALL(hipExtGetLinkTypeAndHopCount(link.srcIndex, link.exeIndex, &linkType, &hopCount));
-      result += GetLinkTypeDesc(linkType, hopCount);
-    }
-
-    if (link.exeIndex != link.dstIndex)
-    {
-      uint32_t linkType, hopCount;
-      HIP_CALL(hipExtGetLinkTypeAndHopCount(link.exeIndex, link.dstIndex, &linkType, &hopCount));
-      if (result != "") result += "+";
-      result += GetLinkTypeDesc(linkType, hopCount);
-    }
-  }
-  else
-  {
-    result = "???";
-  }
-  return result;
+  return GetDesc(link.srcMemType, link.srcIndex, link.exeMemType, link.exeIndex) + "-"
+    + GetDesc(link.exeMemType, link.exeIndex, link.dstMemType, link.dstIndex);
+}
+
+void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
+{
+  // GPU execution agent
+  if (link.exeMemType == MEM_GPU)
+  {
+    // Switch to executing GPU
+    HIP_CALL(hipSetDevice(link.exeIndex));
+
+    bool recordStart = (!ev.useSingleSync || iteration == 0);
+    bool recordStop  = (!ev.useSingleSync || iteration == ev.numIterations - 1);
+
+    int const initOffset = ev.byteOffset / sizeof(float);
+
+    if (ev.useHipCall)
+    {
+      // Record start event
+      if (recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream));
+
+      // Execute hipMemset / hipMemcpy
+      if (ev.useMemset)
+        HIP_CALL(hipMemsetAsync(link.dstMem + initOffset, 42, N * sizeof(float), link.stream));
+      else
+        HIP_CALL(hipMemcpyAsync(link.dstMem + initOffset,
+                                link.srcMem + initOffset,
+                                N * sizeof(float), hipMemcpyDefault,
+                                link.stream));
+      // Record stop event
+      if (recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream));
+    }
+    else
+    {
+      if (!ev.combineTiming && recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream));
+      hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
+                            dim3(link.numBlocksToUse, 1, 1),
+                            dim3(BLOCKSIZE, 1, 1),
+                            0, link.stream,
+                            (ev.combineTiming && recordStart) ? link.startEvent : NULL,
+                            (ev.combineTiming && recordStop)  ? link.stopEvent : NULL,
+                            0, link.blockParam);
+      if (!ev.combineTiming & recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream));
+    }
+
+    // Synchronize per iteration, unless in single sync mode, in which case
+    // synchronize during last warmup / last actual iteration
+    if (!ev.useSingleSync || iteration == -1 || iteration == ev.numIterations - 1)
+    {
+      HIP_CALL(hipStreamSynchronize(link.stream));
+    }
+
+    if (iteration >= 0)
+    {
+      // Record GPU timing
+      if (!ev.useSingleSync || iteration == ev.numIterations - 1)
+      {
+        HIP_CALL(hipEventSynchronize(link.stopEvent));
+        float gpuDeltaMsec;
+        HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, link.startEvent, link.stopEvent));
+        link.totalTime += gpuDeltaMsec;
+      }
+    }
+  }
+  else if (link.exeMemType == MEM_CPU) // CPU execution agent
+  {
+    // Force this thread and all child threads onto correct NUMA node
+    if (numa_run_on_node(link.exeIndex))
+    {
+      printf("[ERROR] Unable to set CPU to NUMA node %d\n", link.exeIndex);
+      exit(1);
+    }
+
+    std::vector<std::thread> childThreads;
+
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+
+    // Launch child-threads to perform memcopies
+    for (int i = 0; i < ev.numCpuPerLink; i++)
+      childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(link.blockParam[i])));
+
+    // Wait for child-threads to finish
+    for (int i = 0; i < ev.numCpuPerLink; i++)
+      childThreads[i].join();
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+
+    // Record time if not a warmup iteration
+    if (iteration >= 0)
+      link.totalTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
+  }
 }
diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp
index 1291bf68f7..11b0e851ca 100644
--- a/tools/TransferBench/TransferBench.hpp
+++ b/tools/TransferBench/TransferBench.hpp
@@ -25,6 +25,7 @@ THE SOFTWARE.
 #include <chrono>
 #include <cstdio>
 #include <cstdlib>
+#include <cstdint>
 #include <set>
 #include <unistd.h>
 #include <map>
@@ -33,7 +34,22 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 #include <hip/hip_ext.h>
 #include <hsa/hsa_ext_amd.h>
-#include "copy_kernel.h"
+#include <hip/hcc_detail/hip_fp16.h>
+
+// Include common_kernel.h from RCCL for copy kernel
+// However define some variables to avoid extra includes / missing defines
+#define NCCL_DEVICE_H_   // Avoid loading devcomm.h
+#define WARP_SIZE 64
+
+typedef uint64_t PackType;
+typedef ulong2 Pack128;
+typedef struct
+{
+  uint16_t data;
+} rccl_bfloat16;
+
+#include "../../src/collectives/device/common_kernel.h"
+#include "EnvVars.hpp"
 
 // Helper macro for catching HIP errors
 #define HIP_CALL(cmd)                                                   \
@@ -50,11 +66,12 @@ THE SOFTWARE.
 // Different src/dst memory types supported
 typedef enum
 {
-  MEM_CPU = 0,             // Pinned CPU memory
-  MEM_GPU = 1              // Global GPU memory
+  MEM_CPU      = 0,    // Pinned CPU memory
+  MEM_GPU      = 1,    // Coarse-grained global GPU memory
+  MEM_GPU_FINE = 2     // Fine-grained global GPU memory
 } MemType;
 
-char const MemTypeStr[3] = "CG";
+char const MemTypeStr[4] = "CGF";
 
 typedef enum
 {
@@ -62,17 +79,6 @@ typedef enum
   MODE_CHECK = 1          // Check data against pattern
 } ModeType;
 
-// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
-struct Link
-{
-  int     exeIndex;        // GPU to execute on
-  MemType srcMemType;      // Source memory type
-  int     srcIndex;        // Source device index
-  MemType dstMemType;      // Destination memory type
-  int     dstIndex;        // Destination device index
-  int     numBlocksToUse;  // Number of threadblocks to use for this Link
-};
-
 // Each threadblock copies N floats from src to dst
 struct BlockParam
 {
@@ -81,46 +87,98 @@ struct BlockParam
     float* dst;
 };
 
+// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
+struct Link
+{
+  // Link config
+  MemType exeMemType;      // Link executor type (CPU or GPU)
+  int     exeIndex;        // Executor index (NUMA node for CPU / device ID for GPU)
+  MemType srcMemType;      // Source memory type
+  int     srcIndex;        // Source device index
+  MemType dstMemType;      // Destination memory type
+  int     dstIndex;        // Destination device index
+  int     numBlocksToUse;  // Number of threadblocks to use for this Link
+
+  // Link implementation
+  float*      srcMem;      // Source memory
+  float*      dstMem;      // Destination memory
+
+  hipEvent_t  startEvent;
+  hipEvent_t  stopEvent;
+  hipStream_t stream;
+  BlockParam* blockParam;
+
+  double totalTime;
+};
+
 void DisplayUsage(char const* cmdName);                      // Display usage instructions
 void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
 void DisplayTopology();                                      // Display GPU topology
-void ParseLinks(char* line, std::vector<Link>& links);       // Parse Link information
-void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr);
+void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector<size_t>& valuesofN);
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex);
+void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links);       // Parse Link information
+void EnablePeerAccess(int const deviceId, int const peerDeviceId);
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
 void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
+void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr);
+void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
+
+
 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
+std::string GetDesc(MemType srcMemType, int srcIndex,
+                    MemType dstMemType, int dstIndex);
 std::string GetLinkDesc(Link const& link);
 
-
-#define MAX_NAME_LEN 64
 #define BLOCKSIZE 256
 #define COPY_UNROLL 4
 #define MEMSET_UNROLL 4
 
+// Dummy reduction function (not used because it's just a copy)
+struct FuncNull {
+  __device__ float operator()(const float x, const float y) const {
+    return 0;
+  }
+};
+
 // GPU copy kernel
 __global__ void __launch_bounds__(BLOCKSIZE)
-CopyKernel(BlockParam* blockParams)
+GpuCopyKernel(BlockParam* blockParams)
 {
-    // Collect the arguments for this block
-    int N = blockParams[blockIdx.x].N;
-    const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
-    float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
+  // Collect the arguments for this block
+  int N = blockParams[blockIdx.x].N;
+  const float* src[1] = {(float* )blockParams[blockIdx.x].src};
+  float* dst[1] = {(float* )blockParams[blockIdx.x].dst};
 
-    Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
+  ReduceOrCopyMulti<COPY_UNROLL, FuncNull, float, 1, 1, 1, 1>(
+    threadIdx.x, BLOCKSIZE, 1, src, 1, dst, N);
 }
 
 // GPU set kernel
 __global__ void __launch_bounds__(BLOCKSIZE)
-MemsetKernel(BlockParam* blockParams)
+GpuMemsetKernel(BlockParam* blockParams)
 {
-    // Collect the arguments for this block
-    int N = blockParams[blockIdx.x].N;
-    float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
+  // Collect the arguments for this block
+  int N = blockParams[blockIdx.x].N;
+  float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
 
-    // Use non-zero value
-    #pragma unroll MEMSET_UNROLL
-    for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
-    {
-      dst[tid] = 1234.0;
-    }
+  // Use non-zero value
+  #pragma unroll MEMSET_UNROLL
+  for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
+  {
+    dst[tid] = 1234.0;
+  }
+}
+
+// CPU copy kernel
+void CpuCopyKernel(BlockParam const& blockParams)
+{
+  memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float));
+}
+
+// CPU memset kernel
+void CpuMemsetKernel(BlockParam const& blockParams)
+{
+  for (int i = 0; i < blockParams.N; i++)
+    blockParams.dst[i] = 1234.0;
 }
diff --git a/tools/TransferBench/copy_kernel.h b/tools/TransferBench/copy_kernel.h
deleted file mode 100644
index e2377fdbb8..0000000000
--- a/tools/TransferBench/copy_kernel.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef COPY_KERNEL_H_
-#define COPY_KERNEL_H_
-#include <cstdio>
-#include <cstdint>
-
-// Define min for ssize_t
-static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
-
-typedef uint64_t PackType;
-
-template<class FUNC, typename T>
-struct MULTI {
-    __device__ PackType operator()(const PackType x, const PackType y) const
-    {
-        return FUNC()(x, y);
-    }
-};
-
-#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
-  size_t ptrval = reinterpret_cast<size_t>(ptr);
-  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
-
-template<typename T> inline __device__
-T vFetch(const volatile T* ptr) {
-  return *ptr;
-}
-
-template<typename T> inline __device__
-void vStore(volatile T* ptr, const T val) {
-  *ptr = val;
-}
-
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
-__attribute__((noinline))
-__device__ inline void ReduceCopy(
-    const int tid, const int nthreads,
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1,
-    volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1, const int N) {
-  for (int idx = tid; idx < N; idx += nthreads) {
-    T val = vFetch(src0+idx);
-    if (TWO_INPUTS) {
-      val = FUNC()(val, vFetch(src1+idx));
-    }
-    vStore(dest0+idx, val);
-    if (TWO_OUTPUTS) {
-      vStore(dest1+idx, val);
-    }
-  }
-}
-
-template<typename T>
-struct FuncPassA {
-  __device__ T operator()(const T x, const T y) const {
-    return x;
-  }
-};
-
-template<typename T>
-struct FuncSum {
-  __device__ T operator()(const T x, const T y) const {
-    return x + y;
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, float> {
-  static_assert(sizeof(PackType) == 2 * sizeof(float),
-      "PackType must be twice the size of float.");
-  union converter {
-    PackType storage;
-    struct {
-      float a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    cr.a = FUNC()(cx.a, cy.a);
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage;
-  }
-};
-
-
-typedef ulong2 Pack128;
-
-template<class FUNC, typename T>
-struct MULTI128 {
-  __device__ void operator()(Pack128& x, Pack128& y) {
-    x.x = MULTI<FUNC, T>()(x.x, y.x);
-    x.y = MULTI<FUNC, T>()(x.y, y.y);
-  }
-};
-
-inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
-  v.x = p->x;
-  v.y = p->y;
-}
-inline __device__ void Store128(Pack128* p, Pack128& v) {
-  p->x = v.x;
-  p->y = v.y;
-}
-
-template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ void ReduceCopyMulti(const int tid, const int nthreads,
-    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
-    const int offset, const int N) {
-  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
-    T val = vFetch(srcs[0]+idx);
-    #pragma unroll
-    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
-    #pragma unroll 1
-    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
-
-    #pragma unroll
-    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
-    #pragma unroll 1
-    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
-  }
-}
-
-#define WARP_SIZE 64
-
-template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
-    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
-    const int elemOffset, const int Npack) {
-  const int inc = nw * UNROLL * WARP_SIZE;
-  int offset = w * UNROLL * WARP_SIZE + t;
-
-  const Pack128* srcs[MAXSRCS];
-  for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
-  Pack128* dsts[MAXDSTS];
-  for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
-
-  while (offset < Npack) {
-    Pack128 vals[UNROLL];
-    // Load and reduce
-    for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
-
-    for (int i=1; i<MINSRCS; i++) {
-      Pack128 vals2[UNROLL];
-      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
-      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
-    }
-    #pragma unroll 1
-    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
-      Pack128 vals2[UNROLL];
-      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
-      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
-    }
-
-    // Store
-    for (int i = 0; i < MINDSTS; i++) {
-      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
-    }
-    #pragma unroll 1
-    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
-      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
-    }
-    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
-    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
-    offset += inc;
-  }
-}
-
-template <typename T>
-__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
-
-// Try to limit consecutive load/stores to 8.
-// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
-#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
-
-template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
-    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
-    int N) {
-  int Nrem = N;
-  if (Nrem <= 0) return;
-
-  int alignDiff = 0;
-  int align = ptrAlign128(srcs[0]);
-  #pragma unroll
-  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
-  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
-  #pragma unroll
-  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
-  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
-
-  int Npreamble = alignDiff ? Nrem :
-    N < alignof(Pack128) ? N :
-    (alignof(Pack128) - align) % alignof(Pack128);
-
-  // stage 1: preamble: handle any elements up to the point of everything coming
-  // into alignment
-  if (Npreamble) {
-    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
-    Nrem -= Npreamble;
-    if (Nrem == 0) return;
-  }
-  int offset = Npreamble;
-
-  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
-  // assuming the pointers we have are all 128-bit alignable.
-  int w = tid / WARP_SIZE;       // Warp number
-  int nw = nthreads / WARP_SIZE; // Number of warps
-  int t = tid % WARP_SIZE;       // Thread (inside the warp)
-
-  const int packFactor = sizeof(Pack128) / sizeof(T);
-
-  // stage 2a: main loop
-  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
-      * (AUTOUNROLL * WARP_SIZE); // round down
-  int Nelem2a = Npack2a * packFactor;
-
-  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
-
-  Nrem -= Nelem2a;
-  if (Nrem == 0) return;
-  offset += Nelem2a;
-
-  // stage 2b: slightly less optimized for section when we don't have full
-  // unrolling
-
-  int Npack2b = Nrem / packFactor;
-  int Nelem2b = Npack2b * packFactor;
-
-  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
-
-  Nrem -= Nelem2b;
-  if (Nrem == 0) return;
-  offset += Nelem2b;
-
-  // stage 2c: tail
-  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
-}
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, typename T>
-__device__ void Copy(volatile T * __restrict__ const dest,
-    const volatile T * __restrict__ const src, const int N) {
-  const T* srcs[2];
-  T* dsts[2];
-  srcs[0] = (const T*)src;
-  dsts[0] = (T*)dest;
-  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
-      1, srcs, 1, dsts, N);
-}
-
-template<int UNROLL, int THREADS, typename T>
-__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1,
-    const volatile T * __restrict__ const src, const int N) {
-  const T* srcs[2];
-  T* dsts[2];
-  srcs[0] = (const T*)src;
-  dsts[0] = (T*)dest0;
-  dsts[1] = (T*)dest1;
-  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
-      1, srcs, 2, dsts, N);
-}
-
-template<int UNROLL, int THREADS, typename T>
-__device__ void Reduce(volatile T * __restrict__ const dest,
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1, const int N) {
-  const T* srcs[2];
-  T* dsts[2];
-  srcs[0] = (const T*)src0;
-  srcs[1] = (const T*)src1;
-  dsts[0] = (T*)dest;
-  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
-      2, srcs, 1, dsts, N);
-}
-
-template<int UNROLL, int THREADS, typename T>
-__device__ void ReduceCopy(volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1,
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1, const int N) {
-  const T* srcs[2];
-  T* dsts[2];
-  srcs[0] = (const T*)src0;
-  srcs[1] = (const T*)src1;
-  dsts[0] = (T*)dest0;
-  dsts[1] = (T*)dest1;
-  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
-      2, srcs, 2, dsts, N);
-}
-#endif // COPY_KERNEL_H_
diff --git a/tools/TransferBench/example.cfg b/tools/TransferBench/example.cfg
index b14762f9e3..2d546f433b 100644
--- a/tools/TransferBench/example.cfg
+++ b/tools/TransferBench/example.cfg
@@ -1,39 +1,43 @@
-#Configfile Format:
-#==================
-#A Link is defined as a uni-directional transfer from src memory location to dst memory location
-#Each single line in the configuration file defines a set of Links to run in parallel
+# Configfile Format:
+# ==================
+# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU
+# Each single line in the configuration file defines a set of Links to run in parallel
 
-#There are two ways to specify the configuration file:
+# There are two ways to specify the configuration file:
 
-#1) Basic
-#   The basic specification assumes the same number of threadblocks/CUs used per link
-#   A positive number of Links is specified followed by that number of triplets describing each Link
+# 1) Basic
+#    The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link
+#    A positive number of Links is specified followed by that number of triplets describing each Link
 
-   #Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)
+#    #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
 
-#2) Advanced
-#   The advanced specification allows different number of threadblocks/CUs used per Link
-#   A negative number of links is specified, followed by quadruples describing each Link
-#   -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)
+# 2) Advanced
+#    The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link
+#    A negative number of links is specified, followed by quadruples describing each Link
+#    -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
 
-#Argument Details:
-#  #Links  :   Number of Links to be run in parallel
-#  #CUs    :   Number of threadblocks/CUs to use for a Link
-#  GpuIndex:   0-indexed GPU id executing the Link
-#  srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode
-#  dstMemL :   Destination memory location (Where the data is to be written to)
-#              Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)
-#              Supported memory locations are:
-#              - P:    Pinned host memory   (on CPU, on NUMA node closest to provided GPU index)
-#              - G:    Global device memory (on GPU)
-#Round brackets may be included for human clarity, but will be ignored
+# Argument Details:
+#   #Links  :   Number of Links to be run in parallel
+#   #CUs    :   Number of threadblocks/CUs to use for a GPU-executed Link
+#   srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode
+#   Executor:   Executor are specified by a character indicating executor type, followed by device index (0-indexed)
+#               - C: CPU-executed  (Indexed from 0 to 1)
+#               - G: GPU-executed  (Indexed from 0 to 3)
+#   dstMemL :   Destination memory location (Where the data is to be written to)
 
-#Examples:
-#1 4 (0 G0 G1)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
-#1 4 (0 G1 G0)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0
-#1 4 (2 P0 G2)              Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
-#2 4 (0 G0 G1) (1 G1 G0)    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
-#-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
+#               Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)
+#               Supported memory locations are:
+#               - C:    Pinned host memory       (on NUMA node, indexed from 0 to 1)
+#               - G:    Global device memory     (on GPU device indexed from 0 to 3)
+#               - F:    Fine-grain device memory (on GPU device indexed from 0 to 3)
 
-# Single link between GPUs 0 and 1
-1 1 (0 G0 G1)
+# Examples:
+# 1 4 (G0->G0->G1)             Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
+# 1 4 (G1->C0->G0)             Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0
+# 1 4 (C0->G2->G2)             Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
+# 2 4 G0->G0->G1 G1->G1->G0    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
+# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
+# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
+
+# Single GPU-executed link between GPUs 0 and 1 using 4 CUs
+1 4 (G0->G0->G1)

From 9aac1ed38fca9335f2c704c19cac96987292da2b Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Thu, 4 Feb 2021 19:05:03 +0000
Subject: [PATCH 13/25] [ib-test] Update for 2.8.3]

---
 tools/ib-test/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ib-test/Makefile b/tools/ib-test/Makefile
index 2cdd46e7e4..7035b4e112 100755
--- a/tools/ib-test/Makefile
+++ b/tools/ib-test/Makefile
@@ -6,7 +6,7 @@ endif
 HIPCC = $(HIP_PATH)/bin/hipcc
 
 EXE = ib_test
-CXXFLAGS = -g -O3 -Iinclude -I../../src/include -DENABLE_TRACE -DRCCL_IB_TEST -ldl -lnuma
+CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/clique -DENABLE_TRACE -DRCCL_IB_TEST -ldl -lnuma
 
 files = $(EXE).cpp utils.cpp ../../src/transport/net_ib.cc ../../src/misc/ibvwrap.cc ../../src/debug.cc
 

From 2f541508c50d4c0d195322b4025717f6cce888ef Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Thu, 4 Feb 2021 19:08:42 +0000
Subject: [PATCH 14/25] [topo_expl] Updating for 2.8.3

---
 tools/topo_expl/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile
index fcf838d232..0dee8416dd 100644
--- a/tools/topo_expl/Makefile
+++ b/tools/topo_expl/Makefile
@@ -6,7 +6,7 @@ endif
 HIPCC = $(HIP_PATH)/bin/hipcc
 
 EXE = topo_expl
-CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE -lnuma
+CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE -lnuma
 
 files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \
 	../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc

From f372c53d52f23f75e2d37592b3bd50acf05d3a7b Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Fri, 5 Feb 2021 16:46:20 +0000
Subject: [PATCH 15/25] [TransferBench] Fixing some merge issues

---
 tools/TransferBench/TransferBench.cpp | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index a433483424..a6ab78c46d 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -166,21 +166,12 @@ int main(int argc, char **argv)
         links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam));
       }
     }
-<<<<<<< HEAD
 
     // Loop over all the different number of bytes to use per Link
     for (auto N : valuesOfN)
     {
       if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
 
-=======
-
-    // Loop over all the different number of bytes to use per Link
-    for (auto N : valuesOfN)
-    {
-      if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
-
->>>>>>> origin/develop
       // Prepare links based on current N
       for (int i = 0; i < numLinks; i++)
       {
@@ -340,7 +331,6 @@ int main(int argc, char **argv)
       DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
 
       if (links[i].exeMemType == MEM_GPU)
-<<<<<<< HEAD
       {
         HIP_CALL(hipEventDestroy(links[i].startEvent));
         HIP_CALL(hipEventDestroy(links[i].stopEvent));
@@ -349,16 +339,6 @@ int main(int argc, char **argv)
       }
       else if (links[i].exeMemType == MEM_CPU)
       {
-=======
-      {
-        HIP_CALL(hipEventDestroy(links[i].startEvent));
-        HIP_CALL(hipEventDestroy(links[i].stopEvent));
-        HIP_CALL(hipStreamDestroy(links[i].stream));
-        HIP_CALL(hipFree(links[i].blockParam));
-      }
-      else if (links[i].exeMemType == MEM_CPU)
-      {
->>>>>>> origin/develop
         free(links[i].blockParam);
       }
     }

From 6dfdfef98f4f2c9790678a0bc714cee8b3c05685 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Sat, 6 Feb 2021 00:19:47 +0000
Subject: [PATCH 16/25] Add gfx908 Rome 4 NICs model

---
 tools/scripts/topo_val.sh                     |   2 +-
 tools/topo_expl/models/topo_8p_rome_4nics.xml | 126 ++++++++++++++++++
 tools/topo_expl/topo_expl.cpp                 |   2 +
 3 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 tools/topo_expl/models/topo_8p_rome_4nics.xml

diff --git a/tools/scripts/topo_val.sh b/tools/scripts/topo_val.sh
index fe11d8b26e..07d8d050d5 100755
--- a/tools/scripts/topo_val.sh
+++ b/tools/scripts/topo_val.sh
@@ -21,7 +21,7 @@
 
 DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-for i in {0..47}
+for i in {0..49}
 do
 	$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
 	$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
diff --git a/tools/topo_expl/models/topo_8p_rome_4nics.xml b/tools/topo_expl/models/topo_8p_rome_4nics.xml
new file mode 100644
index 0000000000..8129927d71
--- /dev/null
+++ b/tools/topo_expl/models/topo_8p_rome_4nics.xml
@@ -0,0 +1,126 @@
+<system version="2">
+  <cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff,00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:43:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:46:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:48:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:4a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
+              <xgmi target="0000:50:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:0a:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:0f:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:4c:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:4e:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:50:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
+              <xgmi target="0000:4a:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:0a:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:0f:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:45:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_1" dev="0" speed="200000" port="1" guid="0x48b9170003a1420c" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000,00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:03:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:05:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:08:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:0a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
+              <xgmi target="0000:4a:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:50:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:0f:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:0b:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:0d:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:0f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
+              <xgmi target="0000:4a:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:50:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:0a:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:13:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_3" dev="1" speed="200000" port="1" guid="0x18604a0003a1420c" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000,00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c7:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:c9:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:cb:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
+              <xgmi target="0000:d1:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:8a:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:90:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:cd:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:cf:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:d1:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
+              <xgmi target="0000:cb:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:8a:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:90:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:c6:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_5" dev="2" speed="200000" port="1" guid="0xd0b9170003a1420c" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000,ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:83:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:86:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:88:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:8a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
+              <xgmi target="0000:cb:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:d1:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:90:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:8c:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+        <pci busid="0000:8e:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+          <pci busid="0000:90:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+            <gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
+              <xgmi target="0000:cb:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:d1:00.0" count="1" tclass="0x038000"/>
+              <xgmi target="0000:8a:00.0" count="1" tclass="0x038000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:85:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_7" dev="3" speed="200000" port="1" guid="0xd0bd170003a1420c" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+</system>
diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp
index db9181996c..77b0597e51 100644
--- a/tools/topo_expl/topo_expl.cpp
+++ b/tools/topo_expl/topo_expl.cpp
@@ -117,6 +117,8 @@ NodeModelDesc model_descs[] = {
   {1, "topo_8p_rome_n4_1.xml",  "single node 8 gfx908 Rome NPS=4"},
   {4, "topo_8p_rome_n4_1.xml",  "4 nodes node 8 gfx908 Rome NPS=4"},
   {2, "topo_8p_rome_pcie.xml",  "2 nodes node 8 VEGA20 PCIe"},
+  {1, "topo_8p_rome_4nics.xml", "single node 8 gfx908 Rome 4 NICs"},
+  {4, "topo_8p_rome_4nics.xml", "4 nodes node 8 gfx908 Rome 4 NICs"},
 };
 
 int main(int argc,char* argv[])

From d00b7d17bd80bbbbc601ac0b32777a070c198834 Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Fri, 5 Feb 2021 17:49:25 -0700
Subject: [PATCH 17/25] Update MP UT to support arbitrary # of GPUs; multiple
 bugfixes (#16)

* Fixing temp file creation/deletion for Clique kernel mode.

* Refactoring of MP unit tests; include bugfixes and general support for any number of GPUs

* GroupCall MP UT properly quits when too many devices specified

* MP UT will programmatically set NCCL_COMM_ID if not specified; updated install script
---
 install.sh                              |   4 +-
 src/clique/CliqueManager.cc             |  47 +++++----
 test/CorrectnessTest.hpp                |  76 +++++++++++---
 test/test_AllGatherMultiProcess.cpp     |  62 +++--------
 test/test_AllGatherMultiProcess.hpp     |  42 +++++---
 test/test_AllReduceMultiProcess.cpp     |  62 +++--------
 test/test_AllReduceMultiProcess.hpp     |  32 ++++--
 test/test_AllToAllMultiProcess.cpp      |  61 +++--------
 test/test_AllToAllMultiProcess.hpp      |  24 +++--
 test/test_BroadcastMultiProcess.cpp     |  62 +++--------
 test/test_BroadcastMultiProcess.hpp     |  33 +++---
 test/test_CombinedCallsMultiProcess.cpp |  60 +++--------
 test/test_CombinedCallsMultiProcess.hpp |  41 ++++++--
 test/test_GatherMultiProcess.cpp        |  62 +++--------
 test/test_GatherMultiProcess.hpp        |  10 +-
 test/test_GroupCallsMultiProcess.cpp    |  78 ++++----------
 test/test_GroupCallsMultiProcess.hpp    |  48 ++++++---
 test/test_ReduceMultiProcess.cpp        |  62 +++--------
 test/test_ReduceMultiProcess.hpp        | 122 ++++++++++++----------
 test/test_ReduceScatterMultiProcess.cpp |  62 +++--------
 test/test_ReduceScatterMultiProcess.hpp | 132 ++++++++++++++----------
 test/test_ScatterMultiProcess.cpp       |  62 +++--------
 test/test_ScatterMultiProcess.hpp       |  10 +-
 23 files changed, 538 insertions(+), 716 deletions(-)

diff --git a/install.sh b/install.sh
index 9c95219e31..fd495513b0 100755
--- a/install.sh
+++ b/install.sh
@@ -200,10 +200,10 @@ if ($run_tests); then
     if (test -f "./test/UnitTests"); then
         if ($run_tests_all); then
             ./test/UnitTests
-            NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess
+            ./test/UnitTestsMultiProcess
         else
             ./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*"
-            NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
+            ./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
         fi
     else
         echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc
index d99f332dee..06dc8f539c 100644
--- a/src/clique/CliqueManager.cc
+++ b/src/clique/CliqueManager.cc
@@ -40,6 +40,7 @@ THE SOFTWARE.
 #include <iostream>
 #include <sstream>
 #include <thread>
+#include <unistd.h>
 
 cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS]  = {};
 int*               CliqueManager::m_staticGpuBarrierMem             = NULL;
@@ -502,28 +503,36 @@ void CliqueManager::WaitForBarrier()
 
 ncclResult_t CliqueManager::BootstrapRootInit(int pid, unsigned long hash)
 {
-  for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
+  if (rcclParamEnableClique())
   {
-    int msgid, fd;
-    std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid);
-    SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd);
-    NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid));
-    SYSCHECK(close(fd), "close");
+      for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
+      {
+        int msgid, fd;
+        std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid);
+        SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd);
+        NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid));
+        SYSCHECK(unlink(msgQueueName.c_str()), "unlink");
+        SYSCHECK(close(fd), "close");
+      }
+
+      std::string shmDir = "/dev/shm/";
+
+      for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
+      {
+        struct stat fileStatus;
+        std::string shmFileName = it->second + std::to_string(hash) + "_" + std::to_string(pid);
+        std::string shmFullPath = shmDir + shmFileName;
+
+        // Check if shm file already exists; if so, unlink it
+        if (stat(shmFullPath.c_str(), &fileStatus) == 0)
+        {
+          NCCLCHECK(shmUnlink(shmFileName.c_str()));
+        }
+      }
   }
-
-  std::string shmDir = "/dev/shm/";
-
-  for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
+  else
   {
-    struct stat fileStatus;
-    std::string shmFileName = it->second + std::to_string(hash) + "_" + std::to_string(pid);
-    std::string shmFullPath = shmDir + shmFileName;
-
-    // Check if shm file already exists; if so, unlink it
-    if (stat(shmFullPath.c_str(), &fileStatus) == 0)
-    {
-      NCCLCHECK(shmUnlink(shmFileName.c_str()));
-    }
+    INFO(NCCL_INIT, "Not performing bootstrap root for clique kernels as clique mode not enabled.");
   }
   return ncclSuccess;
 }
diff --git a/test/CorrectnessTest.hpp b/test/CorrectnessTest.hpp
index b440db9d94..fd7187a810 100644
--- a/test/CorrectnessTest.hpp
+++ b/test/CorrectnessTest.hpp
@@ -702,13 +702,18 @@ namespace CorrectnessTests
     class MultiProcessCorrectnessTest : public CorrectnessTest
     {
     protected:
+        // IMPORTANT: We cannot have any HIP API calls in the parent process.
+        // Do any HIP setup in SetupPerProcess().
         void SetUp() override
         {
-            // Check for NCCL_COMM_ID env variable (otherwise will not init)
+            // Check if NCCL_COMM_ID is already set; if not, set it now
             if (!getenv("NCCL_COMM_ID"))
             {
-                printf("Must set NCCL_COMM_ID prior to execution\n");
-                exit(0);
+                char hostname[HOST_NAME_MAX+1];
+                gethostname(hostname, HOST_NAME_MAX+1);
+                std::string hostnameString(hostname);
+                hostnameString.append(":55513");
+                setenv("NCCL_COMM_ID", hostnameString.c_str(), 0);
             }
 
             // Make the test tuple parameters accessible
@@ -737,10 +742,14 @@ namespace CorrectnessTests
 
             comms.resize(numDevices);
             streams.resize(numDevices);
+            dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
+            Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
         }
 
         void TearDown() override
         {
+            munmap(dataset, sizeof(Dataset));
+
             // Restore env vars after tests
             for (int i = 0; i < numTokens/2; i++) {
                 if (savedEnv[i]) {
@@ -773,9 +782,11 @@ namespace CorrectnessTests
             // Only proceed with testing if there are enough GPUs
             if (numDevices > numDevicesAvailable)
             {
-                fprintf(stdout, "[  SKIPPED ] Test requires %d devices (only %d available)\n",
-                        numDevices, numDevicesAvailable);
-
+                if (rank == 0)
+                {
+                    fprintf(stdout, "[  SKIPPED ] Test requires %d devices (only %d available)\n",
+                            numDevices, numDevicesAvailable);
+                }
                 // Modify the number of devices so that tear-down doesn't occur
                 // This is temporary until GTEST_SKIP() becomes available
                 numDevices = 0;
@@ -795,7 +806,7 @@ namespace CorrectnessTests
             if (res != ncclSuccess)
             {
                 printf("Test failure:%s %d '%s' numRanks:%d\n", __FILE__,__LINE__,ncclGetErrorString(res), numDevices);
-                ASSERT_EQ(res, hipSuccess);
+                ASSERT_EQ(res, ncclSuccess);
             }
         }
 
@@ -803,17 +814,22 @@ namespace CorrectnessTests
         void SetUpPerProcess(int rank, ncclFunc_t const func, ncclComm_t& comm, hipStream_t& stream, Dataset& dataset)
         {
             SetUpPerProcessHelper(rank, comm, stream);
-            dataset.Initialize(numDevices, numElements, dataType, inPlace, func, rank);
+            if (numDevices <= numDevicesAvailable)
+            {
+                dataset.Initialize(numDevices, numElements, dataType, inPlace, func, rank);
+            }
         }
 
         // To be called by each process/rank individually (see GroupCallsMultiProcess)
         void SetUpPerProcess(int rank, std::vector<ncclFunc_t> const& func, ncclComm_t& comm, hipStream_t& stream, std::vector<Dataset*>& datasets)
         {
             SetUpPerProcessHelper(rank, comm, stream);
-
-            for (int i = 0; i < datasets.size(); i++)
+            if (numDevices <= numDevicesAvailable)
             {
-                datasets[i]->Initialize(numDevices, numElements, dataType, inPlace, func[i], rank);
+                for (int i = 0; i < datasets.size(); i++)
+                {
+                    datasets[i]->Initialize(numDevices, numElements, dataType, inPlace, func[i], rank);
+                }
             }
         }
 
@@ -875,7 +891,7 @@ namespace CorrectnessTests
             free(arrayI1);
         }
 
-        void ValidateResults(Dataset const& dataset, int rank, int root = 0) const
+        bool ValidateResults(Dataset const& dataset, int rank, int root = 0) const
         {
             int8_t*   outputI1 = (int8_t   *)malloc(dataset.NumBytes(ncclOutputBuffer));
             uint8_t*  outputU1 = (uint8_t  *)outputI1;
@@ -894,8 +910,11 @@ namespace CorrectnessTests
 
             // only output on root rank is valid for gather collective
             if (dataset.function == ncclCollGather && rank != root)
-                return;
-            HIP_CALL(hipMemcpy(outputI1, dataset.outputs[rank], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost));
+                return true;
+
+            hipError_t err = hipMemcpy(outputI1, dataset.outputs[rank], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost);
+            if (err != hipSuccess)
+                return false;
 
             int8_t*   expectedI1 = (int8_t   *)dataset.expected[rank];
             uint8_t*  expectedU1 = (uint8_t  *)expectedI1;
@@ -953,8 +972,35 @@ namespace CorrectnessTests
                     }
                 }
             }
-            ASSERT_EQ(isMatch, true);
+            return isMatch;
         }
+
+        void ValidateProcesses(std::vector<int> const& pids)
+        {
+            int numProcesses = pids.size();
+            int status[numProcesses];
+            for (int i = 0; i < numProcesses; i++)
+            {
+                waitpid(pids[i], &status[i], 0);
+
+                ASSERT_NE(WIFEXITED(status[i]), 0) << "[ERROR] Child process " << i << " did not exit cleanly.";
+                ASSERT_EQ(WEXITSTATUS(status[i]), EXIT_SUCCESS) << "[ERROR] Child process " << i << " had a test failure.";
+            }
+        }
+
+        void TerminateChildProcess(bool const pass)
+        {
+            if (pass)
+            {
+                exit(EXIT_SUCCESS);
+            }
+            else
+            {
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        Dataset* dataset;
     };
 
     std::string GenerateTestNameString(testing::TestParamInfo<MultiProcessCorrectnessTest::ParamType>& info);
diff --git a/test/test_AllGatherMultiProcess.cpp b/test/test_AllGatherMultiProcess.cpp
index 370f12cc71..1a41298485 100644
--- a/test/test_AllGatherMultiProcess.cpp
+++ b/test/test_AllGatherMultiProcess.cpp
@@ -9,63 +9,27 @@ namespace CorrectnessTests
 {
     TEST_P(AllGatherMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestAllGather(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestAllGather(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestAllGather(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestAllGather(3, *dataset);
-                exit(0);
+                bool pass;
+                TestAllGather(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(AllGatherMultiProcessCorrectnessSweep,
@@ -87,7 +51,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(3072, 3145728),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false, true),
                                 testing::Values("")),
diff --git a/test/test_AllGatherMultiProcess.hpp b/test/test_AllGatherMultiProcess.hpp
index b881d4b7b9..46e4504d5d 100644
--- a/test/test_AllGatherMultiProcess.hpp
+++ b/test/test_AllGatherMultiProcess.hpp
@@ -13,38 +13,50 @@ namespace CorrectnessTests
     class AllGatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const rank, int const numDevices)
+        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const numDevices, std::vector<int> const& ranks)
         {
             size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
 
-            HIP_CALL(hipMemcpy(static_cast<char*>(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount),
-                               byteCount, hipMemcpyDeviceToHost));
-
-            barrier.Wait();
-            // Rank 0 sends answer to other ranks
-            if (rank == 0)
+            for (int i = 0; i < ranks.size(); i++)
             {
-                for (int i = 0; i < dataset.numDevices; i++)
+                int rank = ranks[i];
+                HIP_CALL(hipMemcpy(static_cast<char*>(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount),
+                                   byteCount, hipMemcpyDeviceToHost));
+            }
+            barrier.Wait();
+
+            // Rank 0 sends answer to other ranks
+            for (int i = 0; i < ranks.size(); i++)
+            {
+                int rank = ranks[i];
+                if (rank == 0)
                 {
-                    if (i == rank) continue;
-                    memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes());
+                    for (int i = 0; i < dataset.numDevices; i++)
+                    {
+                        if (i == rank) continue;
+                        memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes());
+                    }
                 }
             }
         }
 
-        void TestAllGather(int rank, Dataset& dataset)
+        void TestAllGather(int rank, Dataset& dataset, bool& pass)
         {
             // Prepare input / output / expected results
             SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
-            if (numElements % numDevices != 0) return;
+            if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
             // Prepare input / output / expected results
             FillDatasetWithPattern(dataset, rank);
-            ComputeExpectedResults(dataset, barrier, rank, numDevices);
+
+            ComputeExpectedResults(dataset, barrier, numDevices, std::vector<int>(1, rank));
 
             size_t const byteCount = dataset.NumBytes() / numDevices;
             size_t const sendCount = dataset.numElements / numDevices;
@@ -58,7 +70,7 @@ namespace CorrectnessTests
             HIP_CALL(hipStreamSynchronize(streams[rank]));
 
             // Check results
-            ValidateResults(dataset, rank);
+            pass = ValidateResults(dataset, rank);
 
             TearDownPerProcess(comms[rank], streams[rank]);
             dataset.Release(rank);
diff --git a/test/test_AllReduceMultiProcess.cpp b/test/test_AllReduceMultiProcess.cpp
index 2a5500e629..34ea532f40 100644
--- a/test/test_AllReduceMultiProcess.cpp
+++ b/test/test_AllReduceMultiProcess.cpp
@@ -10,63 +10,27 @@ namespace CorrectnessTests
 {
     TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestAllReduce(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestAllReduce(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestAllReduce(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestAllReduce(3, *dataset);
-                exit(0);
+                bool pass;
+                TestAllReduce(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep,
@@ -88,7 +52,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(1024, 1048576),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false, true),
                                 testing::Values("")),
diff --git a/test/test_AllReduceMultiProcess.hpp b/test/test_AllReduceMultiProcess.hpp
index 2aa8e27087..f7c00de221 100644
--- a/test/test_AllReduceMultiProcess.hpp
+++ b/test/test_AllReduceMultiProcess.hpp
@@ -13,13 +13,17 @@ namespace CorrectnessTests
     class AllReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const rank)
+        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
         {
             // Copy all inputs to expected arrays temporarily to perform reduction on host
-            HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                               dataset.NumBytes(), hipMemcpyDeviceToHost));
-
+            for (int i = 0; i < ranks.size(); i++)
+            {
+                int rank = ranks[i];
+                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+            }
             barrier.Wait();
+
             // Allocate temporary host array to accumulate results
             int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
             uint8_t*  resultU1 = (uint8_t  *)resultI1;
@@ -68,23 +72,31 @@ namespace CorrectnessTests
                 }
             }
             barrier.Wait();
-            // Copy results into expected array
-            memcpy(dataset.expected[rank], resultI1, dataset.NumBytes());
 
+            // Copy results into expected array
+            for (int i = 0; i < ranks.size(); i++)
+            {
+                int rank = ranks[i];
+                memcpy(dataset.expected[rank], resultI1, dataset.NumBytes());
+            }
             free(resultI1);
         }
 
-        void TestAllReduce(int rank, Dataset& dataset)
+        void TestAllReduce(int rank, Dataset& dataset, bool& pass)
         {
             SetUpPerProcess(rank, ncclCollAllReduce, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
             // Prepare input / output / expected results
             FillDatasetWithPattern(dataset, rank);
-            ComputeExpectedResults(dataset, barrier, op, rank);
+            ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
 
             // Launch the reduction
             ncclAllReduce(dataset.inputs[rank], dataset.outputs[rank],
@@ -94,7 +106,7 @@ namespace CorrectnessTests
             HIP_CALL(hipStreamSynchronize(streams[rank]));
 
             // Check results
-            ValidateResults(dataset, rank);
+            pass = ValidateResults(dataset, rank);
 
             TearDownPerProcess(comms[rank], streams[rank]);
             dataset.Release(rank);
diff --git a/test/test_AllToAllMultiProcess.cpp b/test/test_AllToAllMultiProcess.cpp
index e607861833..842c1ee9e7 100644
--- a/test/test_AllToAllMultiProcess.cpp
+++ b/test/test_AllToAllMultiProcess.cpp
@@ -10,62 +10,27 @@ namespace CorrectnessTests
 {
     TEST_P(AllToAllMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestAllToAll(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestAllToAll(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestAllToAll(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestAllToAll(3, *dataset);
-                exit(0);
+                bool pass;
+                TestAllToAll(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(AllToAllMultiProcessCorrectnessSweep,
@@ -87,7 +52,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(1024, 1048576),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false),
                                 testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
diff --git a/test/test_AllToAllMultiProcess.hpp b/test/test_AllToAllMultiProcess.hpp
index 8fbd56f8a0..76205608c9 100644
--- a/test/test_AllToAllMultiProcess.hpp
+++ b/test/test_AllToAllMultiProcess.hpp
@@ -13,24 +13,32 @@ namespace CorrectnessTests
     class AllToAllMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        static void ComputeExpectedResults(Dataset& dataset, int const rank)
+        static void ComputeExpectedResults(Dataset& dataset, std::vector<int> const& ranks)
         {
-            for (int i = 0; i < dataset.numDevices; i++)
+            for (int i = 0; i < ranks.size(); i++)
             {
-                HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*i,
-                               dataset.NumBytes(), hipMemcpyDeviceToHost));
+                int rank = ranks[i];
+                for (int j = 0; j < dataset.numDevices; j++)
+                {
+                    HIP_CALL(hipMemcpy((int8_t *)dataset.expected[j]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*j,
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+                }
             }
         }
 
-        void TestAllToAll(int rank, Dataset& dataset)
+        void TestAllToAll(int rank, Dataset& dataset, bool& pass)
         {
             SetUpPerProcess(rank, ncclCollAllToAll, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
 
             // Prepare input / output / expected results
             FillDatasetWithPattern(dataset, rank);
-            ComputeExpectedResults(dataset, rank);
+            ComputeExpectedResults(dataset, std::vector<int>(1, rank));
 
             // Launch the reduction
             ncclAllToAll(dataset.inputs[rank],
@@ -42,7 +50,7 @@ namespace CorrectnessTests
             HIP_CALL(hipStreamSynchronize(streams[rank]));
 
             // Check results
-            ValidateResults(dataset, rank);
+            pass = ValidateResults(dataset, rank);
 
             TearDownPerProcess(comms[rank], streams[rank]);
             dataset.Release(rank);
diff --git a/test/test_BroadcastMultiProcess.cpp b/test/test_BroadcastMultiProcess.cpp
index 1d43c6440a..793f118eda 100644
--- a/test/test_BroadcastMultiProcess.cpp
+++ b/test/test_BroadcastMultiProcess.cpp
@@ -17,63 +17,27 @@ namespace CorrectnessTests
 {
     TEST_P(BroadcastMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestBroadcast(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestBroadcast(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestBroadcast(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestBroadcast(3, *dataset);
-                exit(0);
+                bool pass;
+                TestBroadcast(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(BroadcastMultiProcessCorrectnessSweep,
@@ -95,7 +59,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(1024, 1048576),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false, true),
                                 testing::Values("")),
diff --git a/test/test_BroadcastMultiProcess.hpp b/test/test_BroadcastMultiProcess.hpp
index 0075faeeb7..1b5a36e57c 100644
--- a/test/test_BroadcastMultiProcess.hpp
+++ b/test/test_BroadcastMultiProcess.hpp
@@ -13,26 +13,35 @@ namespace CorrectnessTests
     class BroadcastMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
+        static void ComputeExpectedResults(Dataset& dataset, int const root, std::vector<int> const& ranks)
         {
-            // Root has the answer; share it via host memcpy's
-            if (rank == root)
+            for (int h = 0; h < ranks.size(); h++)
             {
-                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-                for (int i = 0; i < dataset.numDevices; i++)
+                int rank = ranks[h];
+                // Root has the answer; share it via host memcpy's
+                if (rank == root)
                 {
-                    if (i == rank) continue;
-                    memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes());
+                    HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
+                                       dataset.NumBytes(), hipMemcpyDeviceToHost));
+                    for (int i = 0; i < dataset.numDevices; i++)
+                    {
+                        if (i == rank) continue;
+                        memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes());
+                    }
+                    break;
                 }
             }
         }
 
-        void TestBroadcast(int rank, Dataset& dataset)
+        void TestBroadcast(int rank, Dataset& dataset, bool& pass)
         {
             SetUpPerProcess(rank, ncclCollBroadcast, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
@@ -41,7 +50,7 @@ namespace CorrectnessTests
             {
                 // Prepare input / output / expected results
                 FillDatasetWithPattern(dataset, rank);
-                ComputeExpectedResults(dataset, root, rank);
+                ComputeExpectedResults(dataset, root, std::vector<int>(1, rank));
 
                 // Launch the reduction (1 process per GPU)
                 ncclResult_t res = ncclBroadcast(dataset.inputs[rank],
@@ -53,7 +62,7 @@ namespace CorrectnessTests
                 HIP_CALL(hipStreamSynchronize(streams[rank]));
 
                 // Check results
-                ValidateResults(dataset, rank);
+                pass = ValidateResults(dataset, rank);
 
                 // Ensure all processes have finished current iteration before proceeding
                 barrier.Wait();
diff --git a/test/test_CombinedCallsMultiProcess.cpp b/test/test_CombinedCallsMultiProcess.cpp
index f7a4f10b4e..9f030f3fcd 100644
--- a/test/test_CombinedCallsMultiProcess.cpp
+++ b/test/test_CombinedCallsMultiProcess.cpp
@@ -25,60 +25,28 @@ namespace CorrectnessTests
             datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
             datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
         }
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
+        std::vector<int> pids(numDevices);
 
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestCombinedCalls(0, datasets, ncclFuncs);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestCombinedCalls(1, datasets, ncclFuncs);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestCombinedCalls(2, datasets, ncclFuncs);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestCombinedCalls(3, datasets, ncclFuncs);
-                exit(0);
+                bool pass;
+                TestCombinedCalls(gpu, datasets, ncclFuncs, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
+
+        ValidateProcesses(pids);
+
         for (int i = 0; i < datasets.size(); i++)
         {
             munmap(datasets[i], sizeof(Dataset));
@@ -104,7 +72,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(3072, 3145728),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false, true),
                                 testing::Values("")),
diff --git a/test/test_CombinedCallsMultiProcess.hpp b/test/test_CombinedCallsMultiProcess.hpp
index e48902514f..9b7ba6bf8b 100644
--- a/test/test_CombinedCallsMultiProcess.hpp
+++ b/test/test_CombinedCallsMultiProcess.hpp
@@ -20,21 +20,26 @@ namespace CorrectnessTests
     class CombinedCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        void TestCombinedCalls(int rank, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs)
+        void TestCombinedCalls(int rank, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
         {
             SetUpPerProcess(rank, funcs, comms[rank], streams[rank], datasets);
 
-            if (numDevices > numDevicesAvailable) return;
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
             // Compute expected results for each dataset in combined
             int const root = 0;
-            AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, rank, numDevices);
-            AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, rank);
-            BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, rank);
-            ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, rank);
-            ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, rank);
+            std::vector<int> ranks(1, rank);
+            AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
+            AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
+            BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
+            ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
+            ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
 
             size_t const byteCount = datasets[0]->NumBytes() / numDevices;
             size_t const elemCount = numElements / numDevices;
@@ -64,12 +69,26 @@ namespace CorrectnessTests
             // Wait for reduction to complete
             HIP_CALL(hipStreamSynchronize(streams[rank]));
 
-            // Check results for each collective in the combined
-            for (int i = 0; i < 5; i++)
+            for (int i = 0; i < funcs.size(); i++)
             {
-                ValidateResults(*datasets[i], rank);
+                for (int j = 0; j < ranks.size(); j++)
+                {
+                    pass = ValidateResults(*datasets[i], ranks[j], root);
+                    if (!pass)
+                    {
+                        break;
+                    }
+                }
                 barrier.Wait();
-                datasets[i]->Release(rank);
+                for (int j = 0; j < ranks.size(); j++)
+                {
+                    datasets[i]->Release(ranks[j]);
+                }
+            }
+
+            for (int i = 0; i < ranks.size(); i++)
+            {
+                TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
             }
         }
     };
diff --git a/test/test_GatherMultiProcess.cpp b/test/test_GatherMultiProcess.cpp
index 7627a15e89..2ba477e168 100644
--- a/test/test_GatherMultiProcess.cpp
+++ b/test/test_GatherMultiProcess.cpp
@@ -10,63 +10,27 @@ namespace CorrectnessTests
 {
     TEST_P(GatherMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollGather);
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestGather(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestGather(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestGather(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestGather(3, *dataset);
-                exit(0);
+                bool pass;
+                TestGather(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(GatherMultiProcessCorrectnessSweep,
@@ -88,7 +52,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(1024, 1048576),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false),
                                 testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
diff --git a/test/test_GatherMultiProcess.hpp b/test/test_GatherMultiProcess.hpp
index e26c7f319b..0f05e7bef8 100644
--- a/test/test_GatherMultiProcess.hpp
+++ b/test/test_GatherMultiProcess.hpp
@@ -19,11 +19,15 @@ namespace CorrectnessTests
                                dataset.NumBytes(), hipMemcpyDeviceToHost));
         }
 
-        void TestGather(int rank, Dataset& dataset)
+        void TestGather(int rank, Dataset& dataset, bool& pass)
         {
             SetUpPerProcess(rank, ncclCollGather, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
@@ -44,7 +48,7 @@ namespace CorrectnessTests
                 HIP_CALL(hipStreamSynchronize(streams[rank]));
 
                 // Check results
-                ValidateResults(dataset, rank, root);
+                pass = ValidateResults(dataset, rank, root);
 
                 // Ensure all processes have finished current iteration before proceeding
                 barrier.Wait();
diff --git a/test/test_GroupCallsMultiProcess.cpp b/test/test_GroupCallsMultiProcess.cpp
index ce7b7ecfa7..b6f8c3fce3 100644
--- a/test/test_GroupCallsMultiProcess.cpp
+++ b/test/test_GroupCallsMultiProcess.cpp
@@ -25,75 +25,39 @@ namespace CorrectnessTests
             datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
             datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
         }
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
+        int const numGpusPerProcess = 2;
+        int const numProcesses = numDevices / numGpusPerProcess;
+        std::vector<int> pids(numProcesses);
+        int process = -1;
 
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        for (int i = 0; i < numDevices; i+= numGpusPerProcess)
         {
-            pid2 = fork();
-            if (numDevices > 4)
+            process++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 4)  || (pid2 > 0 && pid3 > 0 && numDevices > 4))
-            {
-                // Process 0
-                std::vector<int> ranks;
-                ranks.push_back(0);
-                ranks.push_back(1);
+                int gpuIdx = i;
+                int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
 
-                TestGroupCalls(0, ranks, datasets, ncclFuncs);
-                if (pid3 > 0)
+                std::vector<int> ranks;
+                for (; gpuIdx < maxIdx; gpuIdx++)
                 {
-                    waitpid(pid3, NULL, 0);
+                    ranks.push_back(gpuIdx);
                 }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 4) || (pid2 == 0 && pid3 > 0 && numDevices > 4))
-            {
-                // Process 1
-                std::vector<int> ranks;
-                ranks.push_back(2);
-                ranks.push_back(3);
-                TestGroupCalls(1, ranks, datasets, ncclFuncs);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices == 8)
-            {
-                // Process 2 (available when numDevices == 8)
-                std::vector<int> ranks;
-                ranks.push_back(4);
-                ranks.push_back(5);
 
-                TestGroupCalls(2, ranks, datasets, ncclFuncs);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 8)
-            {
-                // Process 3 (available when numDevices == 8)
-                std::vector<int> ranks;
-                ranks.push_back(6);
-                ranks.push_back(7);
-
-                TestGroupCalls(3, ranks, datasets, ncclFuncs);
-                exit(0);
+                bool pass;
+                TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[process] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
+
+        ValidateProcesses(pids);
+
         for (int i = 0; i < datasets.size(); i++)
         {
             munmap(datasets[i], sizeof(Dataset));
@@ -119,7 +83,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(3072, 3145728),
                                 // Number of devices
-                                testing::Values(4),
+                                testing::Values(4,8),
                                 // In-place or not
                                 testing::Values(false, true),
                                 testing::Values("")),
diff --git a/test/test_GroupCallsMultiProcess.hpp b/test/test_GroupCallsMultiProcess.hpp
index bc38688923..8942ba0b1e 100644
--- a/test/test_GroupCallsMultiProcess.hpp
+++ b/test/test_GroupCallsMultiProcess.hpp
@@ -21,34 +21,50 @@ namespace CorrectnessTests
     class GroupCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs)
+        void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
         {
-            if (numDevices > numDevicesAvailable) return;
-
+            ncclGroupStart();
             for (int i = 0; i < ranks.size(); i++)
             {
                 SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
+                if (numDevices > numDevicesAvailable)
+                {
+                    break;
+                }
+            }
+            ncclGroupEnd();
+
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
             }
 
             int numProcesses = numDevices / ranks.size();
             Barrier barrier(process, numProcesses, std::atoi(getenv("NCCL_COMM_ID")));
 
-            int const root = 0;
             for (int i = 0; i < ranks.size(); i++)
             {
-                AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks[i]);
-                AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks[i]);
-                BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks[i]);
-                ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks[i]);
-                ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks[i]);
+                for (int j = 0; j < datasets.size(); j++)
+                {
+                    FillDatasetWithPattern(*datasets[j], ranks[i]);
+                }
             }
+
+            int const root = 0;
+
+            AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
+            AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
+            BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
+            ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
+            ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
             barrier.Wait();
 
-            ncclGroupStart();
-
-            // AllGather
             size_t const byteCount = datasets[0]->NumBytes() / numDevices;
             size_t const elemCount = numElements / numDevices;
+
+            ncclGroupStart();
+            // AllGather
             for (int i = 0; i < ranks.size(); i++)
             {
                 int rank = ranks[i];
@@ -90,7 +106,7 @@ namespace CorrectnessTests
             {
                 int rank = ranks[i];
                 ncclReduceScatter(datasets[4]->inputs[rank],
-                                  (int8_t *)datasets[4]->outputs[rank] + (i * byteCount),
+                                  (int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
                                   elemCount, dataType, op,
                                   comms[rank], streams[rank]);
             }
@@ -108,7 +124,11 @@ namespace CorrectnessTests
             {
                 for (int j = 0; j < ranks.size(); j++)
                 {
-                    ValidateResults(*datasets[i], ranks[j]);
+                    pass = ValidateResults(*datasets[i], ranks[j], root);
+                    if (!pass)
+                    {
+                        break;
+                    }
                 }
                 barrier.Wait();
                 for (int j = 0; j < ranks.size(); j++)
diff --git a/test/test_ReduceMultiProcess.cpp b/test/test_ReduceMultiProcess.cpp
index bf495d382a..7ac0462a98 100644
--- a/test/test_ReduceMultiProcess.cpp
+++ b/test/test_ReduceMultiProcess.cpp
@@ -10,63 +10,27 @@ namespace CorrectnessTests
 {
     TEST_P(ReduceMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduce);
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestReduce(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestReduce(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestReduce(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestReduce(3, *dataset);
-                exit(0);
+                bool pass;
+                TestReduce(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(ReduceMultiProcessCorrectnessSweep,
@@ -88,7 +52,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(1024, 1048576),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false, true),
                                 testing::Values("")),
diff --git a/test/test_ReduceMultiProcess.hpp b/test/test_ReduceMultiProcess.hpp
index 65977608b8..173cb241b9 100644
--- a/test/test_ReduceMultiProcess.hpp
+++ b/test/test_ReduceMultiProcess.hpp
@@ -13,77 +13,93 @@ namespace CorrectnessTests
     class ReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, int const rank)
+        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, std::vector<int> const& ranks)
         {
             // Copy all inputs to expected arrays temporarily to perform reduction on host
-            HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                               dataset.NumBytes(), hipMemcpyDeviceToHost));
+            for (int i = 0; i < ranks.size(); i++)
+            {
+                int rank = ranks[i];
+                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+            }
             barrier.Wait();
 
-            if (rank == root)
+            for (int h = 0; h < ranks.size(); h++)
             {
-                // Allocate temporary host array to accumulate results
-                int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
-                uint8_t*  resultU1 = (uint8_t  *)resultI1;
-                int32_t*  resultI4 = (int32_t  *)resultI1;
-                uint32_t* resultU4 = (uint32_t *)resultI1;
-                int64_t*  resultI8 = (int64_t  *)resultI1;
-                uint64_t* resultU8 = (uint64_t *)resultI1;
-                float*    resultF4 = (float    *)resultI1;
-                double*   resultF8 = (double   *)resultI1;
-                rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-                // Initialize the result with the first device's array
-                memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-
-                // Perform reduction on the other device arrays
-                for (int i = 1; i < dataset.numDevices; i++)
+                int rank = ranks[h];
+                if (rank == root)
                 {
-                    int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                    uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                    int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                    uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                    int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                    uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                    float*    arrayF4 = (float    *)arrayI1;
-                    double*   arrayF8 = (double   *)arrayI1;
-                    rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
+                    // Allocate temporary host array to accumulate results
+                    int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
+                    uint8_t*  resultU1 = (uint8_t  *)resultI1;
+                    int32_t*  resultI4 = (int32_t  *)resultI1;
+                    uint32_t* resultU4 = (uint32_t *)resultI1;
+                    int64_t*  resultI8 = (int64_t  *)resultI1;
+                    uint64_t* resultU8 = (uint64_t *)resultI1;
+                    float*    resultF4 = (float    *)resultI1;
+                    double*   resultF8 = (double   *)resultI1;
+                    rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
 
-                    for (int j = 0; j < dataset.numElements; j++)
+                    // Initialize the result with the first device's array
+                    memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+                    // Perform reduction on the other device arrays
+                    for (int i = 1; i < dataset.numDevices; i++)
                     {
-                        switch (dataset.dataType)
+                        int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                        uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                        int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                        uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                        int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                        uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                        float*    arrayF4 = (float    *)arrayI1;
+                        double*   arrayF8 = (double   *)arrayI1;
+                        rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
+
+                        for (int j = 0; j < dataset.numElements; j++)
                         {
-                        case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
-                        case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
-                        case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
-                        case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
-                        case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
-                        case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
-                        case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
-                        case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
-                        case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
-                        default:
-                            fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                            exit(0);
+                            switch (dataset.dataType)
+                            {
+                            case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                            case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                            case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                            case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                            case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                            case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                            case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                            case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                            case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
+                            default:
+                                fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                                exit(0);
+                            }
                         }
                     }
+                    memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
+                    free(resultI1);
                 }
-                memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
-                free(resultI1);
-                barrier.Wait();
             }
-            else
+            barrier.Wait();
+
+            for (int i = 0; i < ranks.size(); i++)
             {
-                barrier.Wait();
-                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost));
+                int rank = ranks[i];
+                if (rank != root)
+                {
+                    HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost));
+                }
             }
         }
 
-        void TestReduce(int rank, Dataset& dataset)
+        void TestReduce(int rank, Dataset& dataset, bool& pass)
         {
             SetUpPerProcess(rank, ncclCollReduce, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
@@ -92,7 +108,7 @@ namespace CorrectnessTests
             {
                 // Prepare input / output / expected results
                 FillDatasetWithPattern(dataset, rank);
-                ComputeExpectedResults(dataset, barrier, op, root, rank);
+                ComputeExpectedResults(dataset, barrier, op, root, std::vector<int>(1, rank));
                 // Launch the reduction (1 process per GPU)
                 ncclResult_t res = ncclReduce(dataset.inputs[rank],
                                               dataset.outputs[rank],
@@ -101,7 +117,7 @@ namespace CorrectnessTests
                 // Wait for reduction to complete
                 HIP_CALL(hipStreamSynchronize(streams[rank]));
                 // Check results
-                ValidateResults(dataset, rank);
+                pass = ValidateResults(dataset, rank);
                 // Ensure all processes have finished current iteration before proceeding
                 barrier.Wait();
             }
diff --git a/test/test_ReduceScatterMultiProcess.cpp b/test/test_ReduceScatterMultiProcess.cpp
index 48c4b0cb9b..1f230c64a3 100644
--- a/test/test_ReduceScatterMultiProcess.cpp
+++ b/test/test_ReduceScatterMultiProcess.cpp
@@ -10,63 +10,27 @@ namespace CorrectnessTests
 {
     TEST_P(ReduceScatterMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestReduceScatter(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestReduceScatter(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestReduceScatter(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestReduceScatter(3, *dataset);
-                exit(0);
+                bool pass;
+                TestReduceScatter(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(ReduceScatterMultiProcessCorrectnessSweep,
@@ -88,7 +52,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(3072, 3145728),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false, true),
                                 testing::Values("")),
diff --git a/test/test_ReduceScatterMultiProcess.hpp b/test/test_ReduceScatterMultiProcess.hpp
index a38ddf2148..2b57b42f63 100644
--- a/test/test_ReduceScatterMultiProcess.hpp
+++ b/test/test_ReduceScatterMultiProcess.hpp
@@ -13,60 +13,68 @@ namespace CorrectnessTests
     class ReduceScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
     {
     public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const rank)
+        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
         {
             // Copy all inputs to expected arrays temporarily to perform reduction on host
-            HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                               dataset.NumBytes(), hipMemcpyDeviceToHost));
+            for (int i = 0; i < ranks.size(); i++)
+            {
+                int rank = ranks[i];
+                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+            }
             barrier.Wait();
 
             // Have rank 0 do the expected calculation, then send results to other processes
-            int8_t*   resultI1;
-            if (rank == 0)
+            int8_t* resultI1;
+            for (int h = 0; h < ranks.size(); h++)
             {
-                // Allocate temporary host array to accumulate results
-                resultI1           = (int8_t   *)malloc(dataset.NumBytes());
-                uint8_t*  resultU1 = (uint8_t  *)resultI1;
-                int32_t*  resultI4 = (int32_t  *)resultI1;
-                uint32_t* resultU4 = (uint32_t *)resultI1;
-                int64_t*  resultI8 = (int64_t  *)resultI1;
-                uint64_t* resultU8 = (uint64_t *)resultI1;
-                float*    resultF4 = (float    *)resultI1;
-                double*   resultF8 = (double   *)resultI1;
-                rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-                // Initialize the result with the first device's array
-                memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-
-                // Perform reduction on the other device arrays
-                for (int i = 1; i < dataset.numDevices; i++)
+                int rank = ranks[h];
+                if (rank == 0)
                 {
-                    int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                    uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                    int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                    uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                    int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                    uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                    float*    arrayF4 = (float    *)arrayI1;
-                    double*   arrayF8 = (double   *)arrayI1;
-                    rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
+                    // Allocate temporary host array to accumulate results
+                    resultI1           = (int8_t   *)malloc(dataset.NumBytes());
+                    uint8_t*  resultU1 = (uint8_t  *)resultI1;
+                    int32_t*  resultI4 = (int32_t  *)resultI1;
+                    uint32_t* resultU4 = (uint32_t *)resultI1;
+                    int64_t*  resultI8 = (int64_t  *)resultI1;
+                    uint64_t* resultU8 = (uint64_t *)resultI1;
+                    float*    resultF4 = (float    *)resultI1;
+                    double*   resultF8 = (double   *)resultI1;
+                    rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
 
-                    for (int j = 0; j < dataset.numElements; j++)
+                    // Initialize the result with the first device's array
+                    memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+                    // Perform reduction on the other device arrays
+                    for (int i = 1; i < dataset.numDevices; i++)
                     {
-                        switch (dataset.dataType)
+                        int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                        uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                        int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                        uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                        int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                        uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                        float*    arrayF4 = (float    *)arrayI1;
+                        double*   arrayF8 = (double   *)arrayI1;
+                        rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
+
+                        for (int j = 0; j < dataset.numElements; j++)
                         {
-                        case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
-                        case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
-                        case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
-                        case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
-                        case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
-                        case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
-                        case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
-                        case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
-                        case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
-                        default:
-                            fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                            exit(0);
+                            switch (dataset.dataType)
+                            {
+                            case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                            case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                            case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                            case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                            case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                            case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                            case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                            case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                            case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
+                            default:
+                                fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                                exit(0);
+                            }
                         }
                     }
                 }
@@ -75,34 +83,44 @@ namespace CorrectnessTests
             // Copy results into expected arrays
             size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
 
-            HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank],
-                               dataset.NumBytes(), hipMemcpyDeviceToHost));
-
+            for (int i = 0; i < ranks.size(); i++)
+            {
+                int rank = ranks[i];
+                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+            }
             barrier.Wait();
 
-            if (rank == 0)
+            for (int h = 0; h < ranks.size(); h++)
             {
-                for (int i = 0; i < dataset.numDevices; i++)
-                    memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
-                           resultI1 + (i * byteCount), byteCount);
+                int rank = ranks[h];
+                if (rank == 0)
+                {
+                    for (int i = 0; i < dataset.numDevices; i++)
+                        memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
+                               resultI1 + (i * byteCount), byteCount);
 
-                free(resultI1);
+                    free(resultI1);
+                }
             }
         }
 
-        void TestReduceScatter(int rank, Dataset& dataset)
+        void TestReduceScatter(int rank, Dataset& dataset, bool& pass)
         {
             // Prepare input / output / expected results
             SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
-            if (numElements % numDevices != 0) return;
+            if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
             // Prepare input / output / expected results
             FillDatasetWithPattern(dataset, rank);
-            ComputeExpectedResults(dataset, barrier, op, rank);
+            ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
 
             size_t const byteCount = dataset.NumBytes() / numDevices;
             size_t const recvCount = dataset.numElements / numDevices;
@@ -117,7 +135,7 @@ namespace CorrectnessTests
             HIP_CALL(hipStreamSynchronize(streams[rank]));
 
             // Check results
-            ValidateResults(dataset, rank);
+            pass = ValidateResults(dataset, rank);
 
             TearDownPerProcess(comms[rank], streams[rank]);
             dataset.Release(rank);
diff --git a/test/test_ScatterMultiProcess.cpp b/test/test_ScatterMultiProcess.cpp
index 4211ace652..0f12c84308 100644
--- a/test/test_ScatterMultiProcess.cpp
+++ b/test/test_ScatterMultiProcess.cpp
@@ -10,63 +10,27 @@ namespace CorrectnessTests
 {
     TEST_P(ScatterMultiProcessCorrectnessTest, Correctness)
     {
-        Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
         dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollScatter);
-        Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
+        std::vector<int> pids(numDevices);
 
-        int pid1 = 0;
-        int pid2 = 0;
-        int pid3 = 0;
-        pid1 = fork();
-
-        // From this point on, ignore original process as we cannot have it create a HIP context
-        if (pid1 == 0)
+        int gpu = -1;
+        for (int i = 0; i < numDevices; i++)
         {
-            pid2 = fork();
-            if (numDevices > 2)
+            gpu++;
+            int pid = fork();
+            if (pid == 0)
             {
-                pid3 = fork();
-            }
-            if ((pid2 > 0 && pid3 == 0 && numDevices == 2)  || (pid2 > 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 0
-                TestScatter(0, *dataset);
-                if (pid3 > 0)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-            }
-            else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
-            {
-                // Process 1
-                TestScatter(1, *dataset);
-                if (numDevices > 2)
-                {
-                    waitpid(pid3, NULL, 0);
-                }
-                exit(0);
-            }
-            else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
-            {
-                // Process 2 (available when numDevices > 2)
-                TestScatter(2, *dataset);
-                exit(0);
-            }
-            else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
-            {
-                // Process 3 (available when numDevices == 4)
-                TestScatter(3, *dataset);
-                exit(0);
+                bool pass;
+                TestScatter(gpu, *dataset, pass);
+                TerminateChildProcess(pass);
             }
             else
             {
-                exit(0);
+                pids[gpu] = pid;
             }
-            waitpid(pid2, NULL, 0);
-            exit(0);
         }
-        waitpid(pid1, NULL, 0);
-        munmap(dataset, sizeof(Dataset));
+
+        ValidateProcesses(pids);
     }
 
     INSTANTIATE_TEST_SUITE_P(ScatterMultiProcessCorrectnessSweep,
@@ -88,7 +52,7 @@ namespace CorrectnessTests
                                 // Number of elements
                                 testing::Values(1024, 1048576),
                                 // Number of devices
-                                testing::Values(2,3,4),
+                                testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false),
                                 testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
diff --git a/test/test_ScatterMultiProcess.hpp b/test/test_ScatterMultiProcess.hpp
index d48040246d..0a14916c92 100644
--- a/test/test_ScatterMultiProcess.hpp
+++ b/test/test_ScatterMultiProcess.hpp
@@ -23,12 +23,16 @@ namespace CorrectnessTests
             }
         }
 
-        void TestScatter(int rank, Dataset& dataset)
+        void TestScatter(int rank, Dataset& dataset, bool& pass)
         {
             // Prepare input / output / expected results
             SetUpPerProcess(rank, ncclCollScatter, comms[rank], streams[rank], dataset);
 
-            if (numDevices > numDevicesAvailable) return;
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
 
             Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
 
@@ -50,7 +54,7 @@ namespace CorrectnessTests
                 HIP_CALL(hipStreamSynchronize(streams[rank]));
 
                 // Check results
-                ValidateResults(dataset, rank);
+                pass = ValidateResults(dataset, rank);
 
                 barrier.Wait();
             }

From 9cc3b5616601c9cc20ae7d93461e4416fa713cc4 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 9 Feb 2021 01:34:39 +0000
Subject: [PATCH 18/25] Fix GDRDMA read and remove unused files

---
 src/collectives/device/gather.h  | 75 --------------------------------
 src/collectives/device/scatter.h | 75 --------------------------------
 src/transport/net.cc             | 46 ++++++++++----------
 3 files changed, 23 insertions(+), 173 deletions(-)
 delete mode 100644 src/collectives/device/gather.h
 delete mode 100644 src/collectives/device/scatter.h

diff --git a/src/collectives/device/gather.h b/src/collectives/device/gather.h
deleted file mode 100644
index b6c3865875..0000000000
--- a/src/collectives/device/gather.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "primitives.h"
-#include "collectives.h"
-
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclGatherKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->coll.count;
-  const int nranks = comm->nRanks;
-  const int bid = args->coll.bid;
-  const int rank = ring->devUserRanks[0];
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * GATHER_CHUNKSTEPS;
-  const int peersPerChan = DIVUP(nranks, nChannels);
-  const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
-  const int root = args->coll.root;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    for (int i = 0; i < peersPerChan; i++) {
-      if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-        (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-        continue;
-      int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-      int nelem = min(realChunkSize, size-chunkOffset);
-      if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
-        const T* sendbuff = thisInput+chunkOffset;
-        T* recvbuff = thisOutput+chunkOffset+rank*size;
-        if (tid < nthreads && sendbuff != recvbuff) {
-          // local copy
-          ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
-        }
-      }
-      else {
-        int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-        int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-        int peerNone = -1;
-        if (rank == root) {
-          ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
-            prims(tid, nthreads, &peerRecv, &peerNone, NULL, stepSize, channel, comm);
-
-          ssize_t recv_offset = chunkOffset + peerRecv*size;
-          prims.recv(thisOutput+recv_offset, nelem);
-        }
-        else {
-          if (peerSend == root) {
-            ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
-              prims(tid, nthreads, &peerNone, &peerSend, NULL, stepSize, channel, comm);
-
-            ssize_t send_offset = chunkOffset;
-            prims.send(thisInput+send_offset, nelem);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/collectives/device/scatter.h b/src/collectives/device/scatter.h
deleted file mode 100644
index 848b47713d..0000000000
--- a/src/collectives/device/scatter.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "primitives.h"
-#include "collectives.h"
-
-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclScatterKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->coll.count;
-  const int nranks = comm->nRanks;
-  const int bid = args->coll.bid;
-  const int rank = ring->devUserRanks[0];
-  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * SCATTER_CHUNKSTEPS;
-  const int peersPerChan = DIVUP(nranks, nChannels);
-  const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
-  const int root = args->coll.root;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    for (int i = 0; i < peersPerChan; i++) {
-      if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-        (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-        continue;
-      int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-      ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-      int nelem = min(realChunkSize, size-chunkOffset);
-      if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
-        const T* sendbuff = thisInput+chunkOffset+rank*size;
-        T* recvbuff = thisOutput+chunkOffset;
-        if (tid < nthreads && sendbuff != recvbuff) {
-          // local copy
-          ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
-        }
-      }
-      else {
-        int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-        int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-        int peerNone = -1;
-        if (rank == root) {
-          ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
-            prims(tid, nthreads, &peerNone, &peerSend, NULL, stepSize, channel, comm);
-
-          ssize_t send_offset = chunkOffset + peerSend*size;
-          prims.send(thisInput+send_offset, nelem);
-        }
-        else {
-          if (peerRecv == root) {
-            ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
-              prims(tid, nthreads, &peerRecv, &peerNone, NULL, stepSize, channel, comm);
-
-            ssize_t recv_offset = chunkOffset;
-            prims.recv(thisOutput+recv_offset, nelem);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 1f04d3ef13..84f3d3d4d1 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -101,29 +101,6 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     if (resources->buffSizes[LOC_HOSTMEM]) {
       NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
     }
-    if (resources->useGdr) {
-      //CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
-      struct data_struct {hsa_agent_t agent; int counter;} out;
-      out.counter = 0;
-      out.agent.handle = myInfo->cudaDev;
-      hsa_iterate_agents([](hsa_agent_t agent, void* data) {
-        int devId = ((struct data_struct *)data)->agent.handle;
-        hsa_device_type_t type;
-        hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
-        if(type != HSA_DEVICE_TYPE_GPU)
-          return HSA_STATUS_SUCCESS;
-        if(((struct data_struct *)data)->counter!=devId) {
-          ((struct data_struct *)data)->counter++;
-          return HSA_STATUS_SUCCESS;
-        }
-        ((struct data_struct *)data)->agent = agent;
-        return HSA_STATUS_SUCCESS;
-      }, (void*)&out);
-      hsa_amd_hdp_flush_t hdpinfo;
-      hsa_status_t err = hsa_agent_get_info(out.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
-      resources->curr_hdp_reg = hdpinfo.HDP_MEM_FLUSH_CNTL;
-      send->conn.curr_hdp_reg = resources->curr_hdp_reg;
-    }
 
     int offsets[LOC_COUNT];
     offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
@@ -134,6 +111,29 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     }
   }
 
+  if (resources->useGdr) {
+    //CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
+    struct data_struct {hsa_agent_t agent; int counter;} out;
+    out.counter = 0;
+    out.agent.handle = myInfo->cudaDev;
+    hsa_iterate_agents([](hsa_agent_t agent, void* data) {
+      int devId = ((struct data_struct *)data)->agent.handle;
+      hsa_device_type_t type;
+      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+      if(type != HSA_DEVICE_TYPE_GPU)
+        return HSA_STATUS_SUCCESS;
+      if(((struct data_struct *)data)->counter!=devId) {
+        ((struct data_struct *)data)->counter++;
+        return HSA_STATUS_SUCCESS;
+      }
+      ((struct data_struct *)data)->agent = agent;
+      return HSA_STATUS_SUCCESS;
+    }, (void*)&out);
+    hsa_amd_hdp_flush_t hdpinfo;
+    hsa_status_t err = hsa_agent_get_info(out.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
+    resources->curr_hdp_reg = hdpinfo.HDP_MEM_FLUSH_CNTL;
+    send->conn.curr_hdp_reg = resources->curr_hdp_reg;
+  }
   INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
       resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
   return ncclSuccess;

From f1a9ce3fa5cee626c3ff97301e3dbdfd47ea9be1 Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Tue, 9 Feb 2021 03:53:03 +0000
Subject: [PATCH 19/25] Using GTEST_SKIP() to skip unit tests that have
 insufficient devices.  Skipping out earlier

---
 test/CorrectnessTest.hpp | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/test/CorrectnessTest.hpp b/test/CorrectnessTest.hpp
index fd7187a810..15cf6bcf20 100644
--- a/test/CorrectnessTest.hpp
+++ b/test/CorrectnessTest.hpp
@@ -456,6 +456,16 @@ namespace CorrectnessTests
             // Make the test tuple parameters accessible
             std::tie(op, dataType, numElements, numDevices, inPlace, envVals) = GetParam();
 
+            // Collect the number of available GPUs
+            HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));
+
+            // Only proceed with testing if there are enough GPUs
+            if (numDevices > numDevicesAvailable)
+            {
+              GTEST_SKIP();
+              return;
+            }
+
             envString = 0;
             numTokens = 0;
             if (strcmp(envVals, "")) {
@@ -477,22 +487,6 @@ namespace CorrectnessTests
                 }
             }
 
-            // Collect the number of available GPUs
-            HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));
-
-            // Only proceed with testing if there are enough GPUs
-            if (numDevices > numDevicesAvailable)
-            {
-                fprintf(stdout, "[  SKIPPED ] Test requires %d devices (only %d available)\n",
-                        numDevices, numDevicesAvailable);
-
-                // Modify the number of devices so that tear-down doesn't occur
-                // This is temporary until GTEST_SKIP() becomes available
-                numDevices = 0;
-                numDevicesAvailable = -1;
-                return;
-            }
-
             // Initialize communicators
             comms.resize(numDevices);
             NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL));
@@ -509,6 +503,8 @@ namespace CorrectnessTests
         // Clean up per TestTuple
         void TearDown() override
         {
+            if (IsSkipped()) return;
+
             // Release communicators and streams
             for (int i = 0; i < numDevices; i++)
             {

From 911d61f214d45c98df1ee8c0ac23c33fb94b63de Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 9 Feb 2021 15:34:08 -0800
Subject: [PATCH 20/25] 2.8.4-1

Fix hang in corner cases of alltoallv using point to point send/recv.
Harmonize error messages.
Fix missing NVTX section in the license.
Update README.
---
 LICENSE.txt                 | 10 +++++++++-
 README.md                   | 22 +++-------------------
 makefiles/version.mk        |  2 +-
 src/bootstrap.cc            |  6 +++---
 src/enqueue.cc              |  2 +-
 src/graph/connect.cc        |  6 +++---
 src/graph/paths.cc          |  2 +-
 src/graph/search.cc         |  8 ++++----
 src/graph/topo.cc           |  4 ++--
 src/graph/xml.cc            | 12 ++++++------
 src/group.cc                |  8 ++++++--
 src/include/debug.h         |  2 +-
 src/include/shm.h           |  2 +-
 src/include/socket.h        |  2 +-
 src/init.cc                 |  4 ++--
 src/misc/utils.cc           |  4 ++--
 src/proxy.cc                | 10 +++++-----
 src/transport/net_ib.cc     |  2 +-
 src/transport/net_socket.cc |  2 +-
 19 files changed, 53 insertions(+), 57 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index e318c66695..bcd1867a02 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,5 @@
 
- Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
@@ -29,3 +29,11 @@
  The U.S. Department of Energy funded the development of this software
  under subcontract 7078610 with Lawrence Berkeley National Laboratory.
 
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
diff --git a/README.md b/README.md
index 7f0a72f5df..a0ad9e2637 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,13 @@
 # NCCL
 
-Optimized primitives for collective multi-GPU communication.
+Optimized primitives for inter-GPU communication.
 
 ## Introduction
 
-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
 
 For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
 
-## What's inside
-
-At present, the library implements the following collectives operations:
-
-- all-reduce
-- all-gather
-- reduce-scatter
-- reduce
-- broadcast
-
-These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
-
-## Requirements
-
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
-
 ## Build
 
 Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
@@ -89,4 +73,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
 
 ## Copyright
 
-All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
diff --git a/makefiles/version.mk b/makefiles/version.mk
index f2539c595b..f64e8ad49a 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 8
-NCCL_PATCH   := 3
+NCCL_PATCH   := 4
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index bd6ec99c6c..6f682f64a1 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -72,7 +72,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
   int recvSize;
   NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
   if (recvSize > size) {
-    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+    WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
     return ncclInternalError;
   }
   NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
@@ -244,7 +244,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) {
 void* ncclRemoteMemAllocationService(void* args) {
   struct remAllocState* state = (struct remAllocState *) args;
   if (cudaSetDevice(state->cudaDev) != cudaSuccess) {
-    WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
+    WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
   }
 
   // Prepare poll descriptor
@@ -490,7 +490,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
 ncclResult_t bootstrapClose(void* commState) {
   struct extState* state = (struct extState*)commState;
   if (state->unexpectedConnections != NULL) {
-    WARN("Unexpected connections are not empty.\n");
+    WARN("Unexpected connections are not empty");
     return ncclInternalError;
   }
   close(state->extListenFd);
diff --git a/src/enqueue.cc b/src/enqueue.cc
index a427bd982e..4137f61047 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -350,7 +350,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
     case ncclPatternRingTwice:
       info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
     default:
-      WARN("Unknown pattern %d\n", info->pattern);
+      WARN("Unknown pattern %d", info->pattern);
       return ncclInternalError;
   }
   return ncclSuccess;
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index a0f1265c84..a64f9bef25 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -115,7 +115,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
   int x = 0;
   while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
   if (x == NCCL_MAX_TREE_ARITY) {
-    WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
+    WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
     return ncclInternalError;
   }
   tree->down[x] = indexes[d];
@@ -213,7 +213,7 @@ int ncclMinNchannels() {
   if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
   if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
   if (minNchannels > MAXCHANNELS) {
-    WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
+    WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
     minNchannels = MAXCHANNELS;
   }
   if (minNchannels < 0) minNchannels = 0;
@@ -225,7 +225,7 @@ int ncclMaxNchannels() {
   if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
   if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
   if (maxNchannels < 1) {
-    WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
+    WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
     maxNchannels = 1;
   }
   return maxNchannels;
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index eb556c408b..f4e331b542 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -25,7 +25,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
       return ncclSuccess;
     }
   }
-  WARN("Could not find node of type %d id %lx\n", t, id);
+  WARN("Could not find node of type %d id %lx", t, id);
   return ncclInternalError;
 }
 
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 57c66e7664..6e9a208037 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -55,7 +55,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
       return ncclSuccess;
     }
   }
-  WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
+  WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
   return ncclInternalError;
 }
 
@@ -188,7 +188,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
       return ncclSuccess;
     }
   }
-  WARN("Could not find gpu rank %d\n", rank);
+  WARN("Could not find gpu rank %d", rank);
   return ncclInternalError;
 }
 
@@ -199,7 +199,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
       return ncclSuccess;
     }
   }
-  WARN("Could not find net id %lx\n", id);
+  WARN("Could not find net id %lx", id);
   return ncclInternalError;
 }
 
@@ -788,7 +788,7 @@ done:
   }
 
   if (graph->nChannels == 0 && graph->collNet == 0) {
-    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
+    WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
     for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
     graph->inter[0] = graph->inter[1] = 0;
     graph->speedIntra = graph->speedInter = 0.1;
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index d53b532ed5..b5c5cc83af 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -97,7 +97,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
 
 ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
   if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
-    WARN("Error : tried to create too many nodes of type %d\n", type);
+    WARN("Error : tried to create too many nodes of type %d", type);
     return ncclInternalError;
   }
   struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
@@ -421,7 +421,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
     NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
     NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
     if (gpu == NULL) {
-      WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
+      WARN("Add NVLink error : could not find GPU %lx", pBusId);
       return ncclInternalError;
     }
     int count;
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index b2232c2b66..a12865ecd6 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -62,7 +62,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
     if (c == '=') {
       ptr[o] = '\0';
       if (value == NULL) {
-        WARN("XML Parse : Unexpected value with name %s\n", ptr);
+        WARN("XML Parse : Unexpected value with name %s", ptr);
         return ncclInternalError;
       }
       return xmlGetValue(file, value, last);
@@ -128,7 +128,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
     // Re-read the name, we got '/' in the first call
     NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
     if (c != '>') {
-      WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
+      WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
       return ncclInternalError;
     }
     return ncclSuccess;
@@ -141,7 +141,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
   while (c == ' ') {
     NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
     if (a == MAX_ATTR_COUNT) {
-      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
+      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
       // Actually we need to still consume the extra attributes so we have an extra one.
     } else a++;
   }
@@ -169,7 +169,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
   if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
   while (1) {
     if (xml->maxIndex == MAX_NODES) {
-      WARN("Error : XML parser is limited to 1024 nodes\n");
+      WARN("Error : XML parser is limited to 1024 nodes");
       return ncclInternalError;
     }
     struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@@ -360,7 +360,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
   char strValue[MAX_STR_LEN];
   NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
   if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
-  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
+  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
   return ncclSuccess;
 }
 
@@ -592,7 +592,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
     int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
 
     if (maxNvLinks > 0 && nvmlDev == NULL) {
-      WARN("No NVML device handle. Skipping nvlink detection.\n");
+      WARN("No NVML device handle. Skipping nvlink detection.");
       maxNvLinks = 0;
     }
 
diff --git a/src/group.cc b/src/group.cc
index 78a74b65c1..43a93284f2 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -201,7 +201,7 @@ ncclResult_t ncclGroupEnd() {
     if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
       int err = pthread_join(ncclGroupThreads[i], NULL);
       if (err != 0) {
-        WARN("Error waiting for pthread_join : %s\n", strerror(errno));
+        WARN("Error waiting for pthread_join : %s", strerror(errno));
         return ncclSystemError;
       }
       NCCLCHECKGOTO(args->ret, ret, end);
@@ -233,7 +233,7 @@ ncclResult_t ncclGroupEnd() {
         // schedule delta 0, +1, -1, +2, -2, ...
         // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
         for (int d=0; d<=nRanks/4; d++) {
-          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
+          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
           int index = 0;
           int delta = deltas[index];
 sched_delta:
@@ -258,6 +258,10 @@ sched_delta:
               ssize_t sendbytes = totSendBytes-sendOffset;
               if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
               if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+              // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
+              // (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
+              if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
+              if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
               if (sendbytes >= 0 || recvbytes >= 0) {
                 NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
                       recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
diff --git a/src/include/debug.h b/src/include/debug.h
index d88458c24a..e7a152cc97 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 
-void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
 
 // Let code temporarily downgrade WARN into INFO
 extern thread_local int ncclDebugNoWarn;
diff --git a/src/include/shm.h b/src/include/shm.h
index 9cd9d053e3..7334f162a2 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -45,7 +45,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
   *shmPtr = ptr;
   return ncclSuccess;
 sysError:
-  WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
+  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
 cudaError:
   if (fd != -1) close(fd);
   if (create) shm_unlink(shmname);
diff --git a/src/include/socket.h b/src/include/socket.h
index e903b04280..8b59f72218 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -367,7 +367,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
   /* IPv4/IPv6 support */
   int family = remoteAddr->sa.sa_family;
   if (family != AF_INET && family != AF_INET6) {
-    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
+    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6);
     return ncclInternalError;
   }
   int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
diff --git a/src/init.cc b/src/init.cc
index 81831cf5dc..08bddfd23b 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -581,7 +581,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   for (int i = 0; i < nranks; i++) {
     memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
     if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
-      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
       return ncclInvalidUsage;
     }
   }
@@ -878,7 +878,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
   NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
 
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
 
   return ncclSuccess;
 cleanup:
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index b231eb19a1..79e61704dd 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -67,10 +67,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
 }
 
 uint64_t getHash(const char* string, int n) {
-  // Based on DJB2, result = result * 33 + char
+  // Based on DJB2a, result = result * 33 ^ char
   uint64_t result = 5381;
   for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) + string[c];
+    result = ((result << 5) + result) ^ string[c];
   }
   return result;
 }
diff --git a/src/proxy.cc b/src/proxy.cc
index d3824f2346..503781ec7b 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   struct ncclProxyArgs* op = state->ops;
   while (op) {
     if (op->idle & OP_SEEN) {
-      WARN("Active list loop at element %ld\n", OP_INDEX(op));
+      WARN("Active list loop at element %ld", OP_INDEX(op));
     }
     op->idle |= OP_SEEN;
     printf("[%ld]", OP_INDEX(op));
@@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   struct ncclProxyArgs* free = state->pool;
   while (free) {
     if (free->idle & OP_SEEN) {
-      WARN("Free list loop at element %ld\n", OP_INDEX(free));
+      WARN("Free list loop at element %ld", OP_INDEX(free));
     }
     free->idle |= OP_SEEN;
     free = free->next;
@@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   while (p) {
     for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
       if ((p->elems[e].idle & OP_SEEN) == 0) {
-        WARN("Element %d of pool %d has been lost\n", e, i);
+        WARN("Element %d of pool %d has been lost", e, i);
         struct ncclProxyArgs* free = state->pool;
         printf("Free list ");
         while (free) {
@@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
   struct ncclPeer* peerComm = args->channel->peers+peer;
   struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
   if (connector->transportComm == NULL) {
-    WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
+    WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
         type == proxyRecv ? "recv" : "send", peer, args->channel->id);
     return ncclInternalError;
   }
@@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ
   while (nslots*state->slotSize < size) nslots *= 2;
   int s = (ptr-buff)/state->slotSize;
   if (s < 0 || s+nslots > state->nslots) {
-    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
+    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots);
     return ncclInternalError;
   }
   for (int i=0; i<nslots; i++) used[s+i] = 0;
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index f8242a539c..4ed46eb5eb 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -84,7 +84,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
   snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
   char* p = realpath(devicePath, NULL);
   if (p == NULL) {
-    WARN("Could not find real path of %s", *devicePath);
+    WARN("Could not find real path of %s (%s)", devName, devicePath);
   } else {
     // Merge multi-port NICs into the same PCI device
     p[strlen(p)-1] = '0';
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 272d8cdc5c..268fc76d5d 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -237,7 +237,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
     if (fd == -1) {
       // Could not find device vendor. This is handled silently so
       // we don't want to print an INFO error.
-      TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
+      TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno));
       goto end;
     }
     char vendor[7];

From bf8eb407055e71f71eec81e9dfd0a5b2614817b0 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 12 Feb 2021 18:06:19 +0000
Subject: [PATCH 21/25] Move HDP flush to CPU

---
 src/collectives/device/primitives.h | 1 -
 src/include/proxy.h                 | 1 +
 src/transport/net.cc                | 7 ++++++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index a5e52aca7a..5ffb02601c 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -158,7 +158,6 @@ class ncclPrimitives {
 
   inline __device__ void postSend() {
     if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
-    if (conn->curr_hdp_reg) STORE(conn->curr_hdp_reg, 0x1);
     STORE(connTailPtr, step += SLICESTEPS);
   }
 
diff --git a/src/include/proxy.h b/src/include/proxy.h
index 9796baf39a..2e7773e964 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -36,6 +36,7 @@ struct ncclProxyArgs {
   uint64_t transmitted;
   uint64_t done;
   uint64_t end;
+  uint64_t hdp_flushed;
   void* requests[NCCL_STEPS];
   int idle;
 
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 84f3d3d4d1..be658ec4d6 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -283,7 +283,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     // Round to next multiple of sliceSteps
     resources->step = ROUNDUP(resources->step, args->chunkSteps);
-    args->posted = args->transmitted = args->done = resources->step;
+    args->posted = args->transmitted = args->done = args->hdp_flushed = resources->step;
     args->end = resources->step + args->nsteps;
     args->state = ncclProxyOpProgress;
   }
@@ -345,6 +345,11 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
           }
         }
         if (ready) {
+          // flush HDP if not done
+          if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
+            args->hdp_flushed = LOAD(recvTail);
+            STORE(resources->curr_hdp_reg, 1);
+          }
           // Data is ready, try to send.
           NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, args->requests+buffSlot));
           if (args->requests[buffSlot] != NULL) {

From 95f178324c6a3d69edac13873f613251cd774e17 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 18 Feb 2021 00:44:15 +0000
Subject: [PATCH 22/25] Add support to another Rome model

---
 src/graph/rome_models.h   | 28 ++++++++++++++++++--
 src/graph/search.cc       | 55 +++++++++++++++++++++------------------
 tools/topo_expl/utils.cpp |  2 +-
 3 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/src/graph/rome_models.h b/src/graph/rome_models.h
index 20d08fe5b5..21b089c0b4 100755
--- a/src/graph/rome_models.h
+++ b/src/graph/rome_models.h
@@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#define MAX_ROME_GPUS 8
-#define MAX_ROME_NICS 2
+#define MAX_ROME_GPUS 16
+#define MAX_ROME_NICS 8
 
 struct rcclRomeModel {
   int nGpus;
@@ -235,6 +235,28 @@ static struct rcclRomeModel rome_model_46 = {
   .ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
 };
 
+static struct rcclRomeModel rome_model_48 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
+  .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
+  .nicIds = { },
+  .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
+  .nicNuma = { },
+  .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
+  .pattern = "20202020",
+  .ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
+};
+
+static struct rcclRomeModel rome_model_49 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
+  .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
+  .nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, },
+  .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
+  .nicNuma = { 0, 1, 2, 3, },
+  .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
+  .pattern = "21212121",
+  .ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0",
+};
+
 static struct rcclRomeModel romeTopoModels[] = {
   rome_model_22,
   rome_model_25,
@@ -254,4 +276,6 @@ static struct rcclRomeModel romeTopoModels[] = {
   rome_model_44,
   rome_model_45,
   rome_model_46,
+  rome_model_48,
+  rome_model_49,
 };
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 097dcaaeb4..65eb5040f3 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -864,17 +864,6 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclT
   return ncclSuccess;
 }
 
-static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) {
-  *g = 0; *n = 0;
-  int i;
-  if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false;
-  for (int j = 0; j < nnet; j++)
-    if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++;
-  for (int j = 0; j < system->nodes[GPU].count; j++)
-    if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++;
-  return true;
-}
-
 static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) {
   *index = -1;
   for (int i=0; i<system->nodes[GPU].count; i++) {
@@ -893,12 +882,18 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
   romeTopo->nNics = 0;
   romeTopo->nLinks = 0;
   for (int i = 0; i < romeTopo->nGpus; i ++) {
-    int gpu, n;
+    int gpu, n, m, distance;
     NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu));
     romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
-    for (n = 0; n < romeTopo->nCpus; n++)
-      if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count == 2) break;
-    if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id;
+    m = 0;
+    distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
+    for (n = 1; n < romeTopo->nCpus; n++) {
+      if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
+        distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
+        m = n;
+      }
+    }
+    if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
 
     struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
     if (node->paths[GPU] == NULL) continue;
@@ -936,24 +931,32 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
     }
   }
 
+  for (int i = 0; i < romeTopo->nNics; i ++) {
+    int net, n, m, distance;
+    NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
+    m = 0;
+    distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
+    for (n = 0; n < romeTopo->nCpus; n++)
+      if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
+        distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
+        m = n;
+      }
+    if (m < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
+    else return ncclSuccess;
+  }
+
   // number of GPUs and NICs on each numa node is used as first screening pattern
   for (int i = 0; i < romeTopo->nCpus; i++) {
-    int g, n;
-    getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map);
+    int g = 0, n = 0;
+    for (int j = 0; j < romeTopo->nGpus; j++)
+      if (romeTopo->gpuNuma[j] == i) g++;
+    for (int j = 0; j < romeTopo->nNics; j++)
+      if (romeTopo->nicNuma[j] == i) n++;
     pattern[i*2] = '0' + g;
     pattern[i*2+1] = '0' + n;
   }
   pattern[romeTopo->nCpus*2] = 0;
 
-  for (int i = 0; i < romeTopo->nNics; i ++) {
-    int net, n;
-    NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
-    for (n = 0; n < romeTopo->nCpus; n++)
-      if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break;
-    if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id;
-    else return ncclSuccess;
-  }
-
   const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
   if (romeModelFile) {
     INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp
index e5bcf768fd..3d2a70dbbc 100644
--- a/tools/topo_expl/utils.cpp
+++ b/tools/topo_expl/utils.cpp
@@ -163,7 +163,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   for (int i = 0; i < nranks; i++) {
     memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
     if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
-      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
       return ncclInvalidUsage;
     }
   }

From 8aea5edb29e9e1264233067d9579e814905cc8fc Mon Sep 17 00:00:00 2001
From: Cory Bloor <Cordell.Bloor@amd.com>
Date: Thu, 18 Feb 2021 16:37:37 -0700
Subject: [PATCH 23/25] Add Jenkins docs build (#18)

* Fix typo in copyright

* Minor README improvements

- Prevent underscores from being interpreted as italics in test name format.
- Switch URL to HTTPS.

* Update docs scripts config

- Allow run_doc.sh and run_doxygen.sh to be called from any directory.

* Add docs build to Jenkins
---
 .jenkins/staticanalysis.groovy | 68 ++++++++++++++++++++++++++++++++++
 README.md                      |  5 ++-
 docs/run_doc.sh                | 18 ++++-----
 docs/run_doxygen.sh            | 18 +++++----
 docs/source/conf.py            |  6 +--
 5 files changed, 93 insertions(+), 22 deletions(-)
 create mode 100644 .jenkins/staticanalysis.groovy

diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
new file mode 100644
index 0000000000..adc9491910
--- /dev/null
+++ b/.jenkins/staticanalysis.groovy
@@ -0,0 +1,68 @@
+#!/usr/bin/env groovy
+// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
+@Library('rocJenkins@pong') _
+
+// This is file for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+import java.nio.file.Path
+
+def runCompileCommand(platform, project, jobName, boolean debug=false)
+{
+    project.paths.construct_build_prefix()
+    
+    def command = """#!/usr/bin/env bash
+            set -x
+            ${project.paths.project_build_prefix}/docs/run_doc.sh
+            """
+
+    try
+    {
+        platform.runCommand(this, command)
+    }
+    catch(e)
+    {
+        throw e
+    }
+
+    publishHTML([allowMissing: false,
+                alwaysLinkToLastBuild: false,
+                keepAll: false,
+                reportDir: "${project.paths.project_build_prefix}/docs/source/_build/html",
+                reportFiles: "index.html",
+                reportName: "Documentation",
+                reportTitles: "Documentation"])
+}
+
+def runCI = 
+{
+    nodeDetails, jobName->
+
+    def prj  = new rocProject('rccl-internal', 'StaticAnalysis')
+
+    // Define test architectures, optional rocm version argument is available
+    def nodes = new dockerNodes(nodeDetails, jobName, prj)
+
+    boolean formatCheck = false
+    boolean staticAnalysis = true
+
+    def compileCommand =
+    {
+        platform, project->
+
+        runCompileCommand(platform, project, jobName, false)
+    }
+
+    buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
+}
+
+ci: { 
+    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
+
+    properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
+    stage(urlJobName) {
+        runCI([ubuntu18:['any']], urlJobName)
+    }
+}
diff --git a/README.md b/README.md
index 433387fe85..86ee8501a2 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,8 @@ There are unit tests implemented with the Googletest framework in RCCL, which ar
 To invoke the unit tests, go to the build folder, then the test subfolder, and execute the appropriate unit test executable(s).
 
 Unit test names are now of the format:
-[CollectiveCall]CorrectnessSweep/[CollectiveCall]CorrectnessTest.[Type of test]/[ncclRedOp_t]_[datatype]_[number of elements]_[number of devices]_[in place/out of place]_[environment variables]
+
+    [CollectiveCall]CorrectnessSweep/[CollectiveCall]CorrectnessTest.[Type of test]/[ncclRedOp_t]_[datatype]_[number of elements]_[number of devices]_[in place/out of place]_[environment variables]
 
 This allows filtering of unit tests being run by their parameter values by passing the --gtest_filter command line flag, for example:
 
@@ -82,7 +83,7 @@ See the rccl-tests README for more information on how to build and run those tes
 
 ## Library and API Documentation
 
-Please refer to the [Library documentation](http://rccl.readthedocs.io/) for current documentation.
+Please refer to the [Library documentation](https://rccl.readthedocs.io/) for current documentation.
 
 ## Copyright
 
diff --git a/docs/run_doc.sh b/docs/run_doc.sh
index 826b8ea2ec..4de10d87c3 100755
--- a/docs/run_doc.sh
+++ b/docs/run_doc.sh
@@ -1,17 +1,15 @@
 #!/bin/bash
-# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 
-if [ -d docBin ]; then
-    rm -rf docBin
-fi
+set -eu
 
-sed -e 's/ROCFFT_EXPORT //g' ../library/include/rocfft.h > rocfft.h
-doxygen Doxyfile
+# Make this directory the PWD
+cd "$(dirname "${BASH_SOURCE[0]}")"
 
+# Build doxygen info
+./run_doxygen.sh
+
+# Build sphinx docs
 cd source
 make clean
 make html
-cd ..
-
-rm rocfft.h
-
diff --git a/docs/run_doxygen.sh b/docs/run_doxygen.sh
index 4424e76b82..a3cc4361f5 100755
--- a/docs/run_doxygen.sh
+++ b/docs/run_doxygen.sh
@@ -1,13 +1,17 @@
 #!/bin/bash
-# # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# # Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 
-if [ -d docBin ]; then
-    rm -rf docBin
-fi
+set -eu
 
-rm nccl.h
+# Make this directory the PWD
+cd "$(dirname "${BASH_SOURCE[0]}")"
 
-sed -e 's/ROCFFT_EXPORT //g' ../src/nccl.h.in > nccl.h
+# Rename our input file
+cp ../src/nccl.h.in nccl.h
+
+# Build the doxygen info
+rm -rf docBin
 doxygen Doxyfile
-#rm nccl.h
 
+# Cleanup
+rm nccl.h
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 3ec199e2c8..e1b437dc20 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -56,8 +56,8 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'RCCL'
-copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2020 Advanced Mirco Devices'
-author = u'Advanced Mirco Devices'
+copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2020 Advanced Micro Devices'
+author = u'Advanced Micro Devices'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -156,7 +156,7 @@ latex_elements = {
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
     (master_doc, 'RCCL.tex', u'RCCL Documentation',
-     u'Advanced Mirco Devices', 'manual'),
+     u'Advanced Micro Devices', 'manual'),
 ]
 
 

From 6a568a8652d138c8716c2844902fd192fdafb6d5 Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Fri, 19 Feb 2021 21:37:51 +0000
Subject: [PATCH 24/25] Make CI only run subset of unit tests.

---
 .jenkins/common.groovy | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 8e31847c2e..037c63a0c0 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -21,7 +21,8 @@ def runTestCommand (platform, project)
     def command = """#!/usr/bin/env bash
                 set -x
                 cd ${project.paths.project_build_prefix}/build/release/test
-                ${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
+                ${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*" --gtest_output=xml --gtest_color=yes
+                ${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*" --gtest_output=xml --gtest_color=yes
             """
 
    platform.runCommand(this, command)

From 45f5255f7ceb7635ec5767647ed8ab7864ea6cc1 Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Fri, 19 Feb 2021 22:22:46 +0000
Subject: [PATCH 25/25] Fixing cache deletion for CliqueManager; updating
 copyright

---
 docs/source/conf.py         | 2 +-
 src/clique/CliqueManager.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index e1b437dc20..fe3b05c3dd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -56,7 +56,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'RCCL'
-copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2020 Advanced Micro Devices'
+copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2021 Advanced Micro Devices'
 author = u'Advanced Micro Devices'
 
 # The version info for the project you're documenting, acts as replacement for
diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc
index 06dc8f539c..93c8eb091c 100644
--- a/src/clique/CliqueManager.cc
+++ b/src/clique/CliqueManager.cc
@@ -82,7 +82,7 @@ void CliqueManager::CleanUp()
   {
     // Release caches
     if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
-    if (m_ipcHandleSendCache) delete m_ipcHandleRecvCache;
+    if (m_ipcHandleRecvCache) delete m_ipcHandleRecvCache;
 
     // Close shared memory
     m_shmHandles.Close();