diff --git a/CMakeLists.txt b/CMakeLists.txt
index 945641c65b..89bd866ad5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,18 +173,22 @@ set(CC_SOURCES
     src/collectives/all_to_all_api.cc
     src/collectives/all_to_allv_api.cc
     src/channel.cc
-    src/clique/CliqueManager.cc     # RCCL
-    src/clique/HandleCache.cc       # RCCL
-    src/clique/HandleShm.cc         # RCCL
-    src/clique/Hash.cc              # RCCL
-    src/clique/MsgQueue.cc          # RCCL
-    src/clique/ShmObject.cc         # RCCL
+    #src/clique/CliqueManager.cc     # RCCL
+    #src/clique/HandleCache.cc       # RCCL
+    #src/clique/HandleShm.cc         # RCCL
+    #src/clique/Hash.cc              # RCCL
+    #src/clique/MsgQueue.cc          # RCCL
+    #src/clique/ShmObject.cc         # RCCL
     src/misc/argcheck.cc
     src/misc/nvmlwrap_stub.cc
     src/misc/utils.cc
     src/misc/ibvwrap.cc
     src/misc/nvmlwrap_stub.cc
     src/misc/rocm_smi_wrap.cc
+    src/misc/profiler.cc
+    src/misc/shmutils.cc
+    src/misc/socket.cc
+    src/misc/param.cc
     src/transport/coll_net.cc
     src/transport/net.cc
     src/transport/net_ib.cc
@@ -196,6 +200,7 @@ set(CC_SOURCES
     src/group.cc
     src/bootstrap.cc
     src/proxy.cc
+    src/net.cc
     src/enqueue.cc)
 
 foreach(filename ${CC_SOURCES})
@@ -212,11 +217,6 @@ if(PROFILE)
   add_definitions(-DENABLE_PROFILING)
 endif()
 
-if(TIMING_PROFILE)
-  add_definitions(-DENABLE_PROFILING)
-  add_definitions(-DENABLE_TIMING_PROFILE)
-endif()
-
 set(COLLTRACE 1 CACHE BOOL "Collective Trace Option")
 if(COLLTRACE)
   add_definitions(-DENABLE_COLLTRACE)
diff --git a/ext-net/google-fastsocket/Makefile b/ext-net/google-fastsocket/Makefile
index e40e3053ad..8dfa8ca4ae 100644
--- a/ext-net/google-fastsocket/Makefile
+++ b/ext-net/google-fastsocket/Makefile
@@ -1,10 +1,10 @@
 CUDA_HOME?=/usr/local/cuda
 INC:=-I$(CUDA_HOME)/include
-PLUGIN_SO:=../../build/libnccl-net.so
+PLUGIN_SO:=libnccl-net.so
 
 default: $(PLUGIN_SO)
 
-$(PLUGIN_SO): nccl-fastsocket/net_fastsocket.cc nccl-fastsocket/compat.cc
+$(PLUGIN_SO): nccl-fastsocket/net_fastsocket.cc nccl-fastsocket/compat.cc nccl-fastsocket/utilities.cc
 	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
 
 nccl-fastsocket/%.cc:
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 64f8d2dc6e..1a1c2b66f8 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -23,7 +23,6 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
 
-
 # You should define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
 CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
@@ -39,7 +38,7 @@ CUDA11_PTX    = -gencode=arch=compute_80,code=compute_80
 
 # Include Ampere support if we're using CUDA11 or above
 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
-  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX)
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
 # Include Volta support if we're using CUDA9 or above
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 22bddcee2e..7c9bf0f136 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 11
-NCCL_PATCH   := 4
+NCCL_MINOR   := 12
+NCCL_PATCH   := 10
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index a548840b3d..82e21a04ea 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -9,8 +9,8 @@ include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
-		misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc \
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
+		misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \
 		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
                 collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
                 graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -74,14 +74,14 @@ $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
 
+null :=
+space := $(null) #
+comma := ,
+
 $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Archiving  %-35s > %s\n" $(STATICLIBTARGET) $@
 	mkdir -p $(LIBDIR)
-	$(eval TMP := $(shell mktemp -d))
-	cp $(LIBOBJ) $(TMP)
-	cd $(TMP) && ar x $(DEVICELIB) && cd -
-	ar cr $@ $(LIBOBJ) $(TMP)/*.o
-	rm -Rf $(TMP)
+	printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M
 
 $(PKGDIR)/nccl.pc : nccl.pc.in
 	mkdir -p $(PKGDIR)
@@ -121,7 +121,7 @@ clean :
 	$(MAKE) -C collectives/device clean
 	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
 
-install : lib
+install : build
 	mkdir -p $(PREFIX)/lib
 	mkdir -p $(PREFIX)/lib/pkgconfig
 	mkdir -p $(PREFIX)/include
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index b38f8be0bb..daaa8cdbb7 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,18 +9,13 @@
 #include "utils.h"
 #include "bootstrap.h"
 #include "net.h"
-#include "socket.h"
 #include <unistd.h>
 #include <sys/types.h>
-// [RCCL]
-#include "clique/CliqueManager.h"
-#include "clique/CliqueShmNames.h"
-#include "clique/Hash.h"
-// [/RCCL]
+#include "proxy.h"
 
 /* Init functions */
 static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
-static union socketAddress bootstrapNetIfAddr;
+static union ncclSocketAddress bootstrapNetIfAddr;
 static int bootstrapNetInitDone = 0;
 pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
 
@@ -31,17 +25,17 @@ ncclResult_t bootstrapNetInit() {
     if (bootstrapNetInitDone == 0) {
       char* env = getenv("NCCL_COMM_ID");
       if (env) {
-        union socketAddress remoteAddr;
-        if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
+        union ncclSocketAddress remoteAddr;
+        if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
           WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
           return ncclInvalidArgument;
         }
-        if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+        if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
           WARN("NET/Socket : No usable listening interface found");
           return ncclSystemError;
         }
       } else {
-        int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
+        int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
         if (nIfs <= 0) {
           WARN("Bootstrap : no socket interface found");
           return ncclInternalError;
@@ -49,7 +43,7 @@ ncclResult_t bootstrapNetInit() {
       }
       char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
       sprintf(line, " %s:", bootstrapNetIfName);
-      socketToString(&bootstrapNetIfAddr, line+strlen(line));
+      ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line));
       INFO(NCCL_INIT, "Bootstrap : Using%s", line);
       bootstrapNetInitDone = 1;
     }
@@ -61,35 +55,28 @@ ncclResult_t bootstrapNetInit() {
 /* Socket Interface Selection type */
 enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
 
-static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd, union socketAddress *addr) {
-  struct sockaddr *saddr = &addr->sa;
-  socklen_t socklen = sizeof(union socketAddress);
-  SYSCHECKVAL(accept(listenFd, saddr, &socklen), "accept", *recvFd);
-  return ncclSuccess;
-}
-
 // Additional sync functions
-static ncclResult_t bootstrapNetSend(int fd, union socketAddress *addr, void* data, int size) {
-  NCCLCHECK(socketSend(fd, addr, &size, sizeof(int)));
-  NCCLCHECK(socketSend(fd, addr, data, size));
+static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) {
+  NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int)));
+  NCCLCHECK(ncclSocketSend(sock, data, size));
   return ncclSuccess;
 }
-static ncclResult_t bootstrapNetRecv(int fd, union socketAddress *addr, void* data, int size) {
+static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) {
   int recvSize;
-  NCCLCHECK(socketRecv(fd, addr, &recvSize, sizeof(int)));
+  NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int)));
   if (recvSize > size) {
     WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
     return ncclInternalError;
   }
-  NCCLCHECK(socketRecv(fd, addr, data, std::min(recvSize, size)));
+  NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size)));
   return ncclSuccess;
 }
 
 struct extInfo {
   int rank;
   int nranks;
-  union socketAddress extAddressListenRoot;
-  union socketAddress extAddressListen;
+  union ncclSocketAddress extAddressListenRoot;
+  union ncclSocketAddress extAddressListen;
 };
 
 #include <sys/resource.h>
@@ -102,33 +89,25 @@ static ncclResult_t setFilesLimit() {
   return ncclSuccess;
 }
 
-static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to include hash argument)
-
-  // [RCCL] Unpack bootstrapRootStruct
-  struct bootstrapRootStruct rootStruct = *(struct bootstrapRootStruct*)bootstrapRootStruct;
-  int listenFd = rootStruct.listenFd;
-  unsigned long hash = rootStruct.hash;
-  int pid = getpid(); // sharing PID to other ranks for creating shared memory files for CliqueManager
-  free(bootstrapRootStruct);
-  // [/RCCL]
-
+static void *bootstrapRoot(void* args) {
+  struct ncclSocket* listenSock = (struct ncclSocket*)args;
   ncclResult_t res = ncclSuccess;
   int nranks = 0, c = 0;
   struct extInfo info;
-  union socketAddress *rankAddresses = NULL;
-  union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
-  union socketAddress *zero = NULL;
+  union ncclSocketAddress *rankAddresses = NULL;
+  union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
+  union ncclSocketAddress *zero = NULL;
   NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
   setFilesLimit();
 
   TRACE(NCCL_INIT, "BEGIN");
   /* Receive addresses from all ranks */
   do {
-    int tmpFd;
-    union socketAddress addr;
-    NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd, &addr), res, out);
-    NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &addr, &info, sizeof(info)), res, out);
-    close(tmpFd);
+    struct ncclSocket sock;
+    sock.abortFlag = NULL; sock.asyncFlag = 0; // set both flags explicitly (as bootstrapInit does); a stack ncclSocket is otherwise uninitialized
+    NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
+    NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
+    close(sock.fd);
 
     if (c == 0) {
       nranks = info.nranks;
@@ -141,40 +120,35 @@ static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to in
       goto out;
     }
 
-    if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
+    if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) {
       WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
       goto out;
     }
 
     // Save the connection handle for that rank
-    memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
-    memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
+    memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress));
+    memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress));
 
     ++c;
     TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d",  info.rank, c, nranks);
   } while (c < nranks);
   TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
 
-  { // [RCCL] Initialize message queues / shared memory files
-    NCCLCHECKGOTO(CliqueManager::BootstrapRootInit(pid, hash), res, out);
-  } // [/RCCL]
-
   // Send the connect handle for the next rank in the AllGather ring
   for (int r=0; r<nranks; ++r) {
     int next = (r+1) % nranks;
-
-    int tmpSendFd;
-    NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
-    NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddressesRoot+r, rankAddresses+next, sizeof(union socketAddress)), res, out);
-    { // [RCCL] Send the root pid for shared file naming
-      NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddressesRoot+r, &pid, sizeof(int)), res, out);
-    } // [/RCCL]
-    close(tmpSendFd);
+    struct ncclSocket sock;
+    sock.abortFlag = NULL; sock.asyncFlag = 0; // set both flags explicitly (as bootstrapInit does); a stack ncclSocket is otherwise uninitialized
+    memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
+    NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
+    NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
+    close(sock.fd);
   }
   TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
 
 out:
-  close(listenFd);
+  close(listenSock->fd);
+  free(listenSock);
   if (rankAddresses) free(rankAddresses);
   if (rankAddressesRoot) free(rankAddressesRoot);
   if (zero) free(zero);
@@ -184,36 +158,32 @@ out:
 }
 
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
-  union socketAddress* connectAddr = (union socketAddress*) id;
-  int listenFd;
-  NCCLCHECK(createListenSocket(&listenFd, connectAddr));
+  struct ncclSocket* listenSock;
+  NCCLCHECK(ncclCalloc(&listenSock, 1));
+  memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketListen(listenSock));
+  memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
   pthread_t thread;
-
-  // [RCCL] Use the ncclUniqueId to get a hash for bootstrap
-  struct bootstrapRootStruct* rootStruct = new struct bootstrapRootStruct;
-  rootStruct->hash = djb2Hash(id->internal);
-  rootStruct->listenFd = listenFd;
-  pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct);
-  pthread_detach(thread); // [RCCL] Adding detach to properly clean up bootstrapRoot thread
-  // [/RCCL]
-
+  pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
+  ncclSetThreadName(thread, "NCCL BootstrapR"); // name the thread before detaching: a detached thread's id must not be used once it may have exited
+  pthread_detach(thread); // will not be pthread_join()'d
   return ncclSuccess;
 }
 
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
-  static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
   memset(id, 0, sizeof(ncclUniqueId));
-  union socketAddress* connectAddr = (union socketAddress*) id;
+  union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id;
 
   char* env = getenv("NCCL_COMM_ID");
   if (env) {
     INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
-    if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
+    if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
       WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
       return ncclInvalidArgument;
     }
   } else {
-    memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
+    memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
     NCCLCHECK(bootstrapCreateRoot(id, false));
   }
 
@@ -223,157 +193,51 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
 struct unexConn {
   int peer;
   int tag;
-  int fd;
-  union socketAddress addr;
+  struct ncclSocket sock;
   struct unexConn* next;
 };
 
-// Remote allocator state
-struct remAllocState {
-  int cudaDev;
-  int listenFd;
-  volatile int stop;
-};
-
-struct extState {
-  int extListenFd;
-  int extRingRecvFd;
-  int extRingSendFd;
-  union socketAddress extRingRecvAddr, extRingSendAddr;
-  union socketAddress* peerCommAddresses;
-  union socketAddress* peerAllocAddresses;
+struct bootstrapState {
+  struct ncclSocket listenSock;
+  struct ncclSocket ringRecvSocket;
+  struct ncclSocket ringSendSocket;
+  union ncclSocketAddress* peerCommAddresses;
+  union ncclSocketAddress* peerProxyAddresses;
   struct unexConn* unexpectedConnections;
   int cudaDev;
   int rank;
   int nranks;
-
-  // Intermediate memory allocation service
-  struct remAllocState* allocState;
-  pthread_t allocThread;
+  volatile uint32_t *abortFlag;
 };
 
-#define MAX_SEGMENTS 128
-
-static ncclResult_t remoteAlloc(void** ptr, int fd, union socketAddress *addr) {
-  size_t size;
-  NCCLCHECK(socketRecv(fd, addr, &size, sizeof(size_t)));
-  hipIpcMemHandle_t devIpc;
-  NCCLCHECK(ncclCudaCalloc((char**)ptr, size));
-  hipError_t res = hipIpcGetMemHandle(&devIpc, *ptr);
-  if (res != hipSuccess) {
-    WARN("[Rem Allocator] hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
-    hipFree(*ptr);
-    CUDACHECK(res);
-  }
-  // The CUDA IPC
-  NCCLCHECK(socketSend(fd, addr, &devIpc, sizeof(hipIpcMemHandle_t)));
-  // And the direct pointer
-  NCCLCHECK(socketSend(fd, addr, ptr, sizeof(void*)));
-  return ncclSuccess;
-}
-
-#include <poll.h>
-
-// Service thread to allocate memory for other GPUs, used as intermediate step.
-void* ncclRemoteMemAllocationService(void* args) {
-  struct remAllocState* state = (struct remAllocState *) args;
-  if (hipSetDevice(state->cudaDev) != hipSuccess) {
-    WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
-  }
-
-  // Prepare poll descriptor
-  void* segments[MAX_SEGMENTS];
-  struct pollfd pollfds[MAX_SEGMENTS+1];
-  for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
-  for (int s=0; s<MAX_SEGMENTS; s++) {
-    pollfds[s].fd = -1;
-    pollfds[s].events = POLLIN;
-  }
-  pollfds[MAX_SEGMENTS].fd = state->listenFd;
-  pollfds[MAX_SEGMENTS].events = POLLIN;
-
-  int nbuffers = 0;
-  while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
-    if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
-      WARN("[Rem Allocator] Poll failed with error %d", error);
-      return NULL;
-    }
-    if (pollfds[MAX_SEGMENTS].revents) {
-      int s = 0;
-      union socketAddress addr;
-      while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
-      if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd, &addr) != ncclSuccess) {
-        pollfds[s].fd = -1;
-      } else {
-        if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd, &addr) != ncclSuccess)) {
-          WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
-          close(pollfds[s].fd);
-          pollfds[s].fd = -1;
-        } else {
-          nbuffers++;
-        }
-      }
-    }
-    for (int s=0; s<MAX_SEGMENTS; s++) {
-      if (pollfds[s].revents & (POLLIN|POLLHUP)) {
-        if (hipFree(segments[s]) != hipSuccess) {
-          WARN("[Rem Allocator] hipFree %p failed", segments[s]);
-        }
-        segments[s] = NULL;
-        close(pollfds[s].fd);
-        pollfds[s].fd = -1;
-        nbuffers--;
-      }
-    }
-  }
-  for (int s=0; s<MAX_SEGMENTS; s++) {
-    if (segments[s]) hipFree(segments[s]);
-    close(pollfds[s].fd);
-  }
-  close(state->listenFd);
-  free(state);
-  return NULL;
-}
-
-ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr) {
-  struct extState* state = (struct extState*)commState;
-  int fd;
-  ncclResult_t res;
-  *id = -1;
-  union socketAddress *addr = state->peerAllocAddresses+rank;
-  NCCLCHECK(connectAddress(&fd, addr));
-  NCCLCHECKGOTO(socketSend(fd, addr, &size, sizeof(size_t)), res, end);
-  NCCLCHECKGOTO(socketRecv(fd, addr, ipc, sizeof(hipIpcMemHandle_t)), res, end);
-  NCCLCHECKGOTO(socketRecv(fd, addr, ptr, sizeof(void*)), res, end);
-  *id = fd;
-end:
-  return res;
-}
-
-ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
-  SYSCHECK(close(id), "close");
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState, int* rootPid) { // [RCCL] Adding rootPid
-  struct extState* state;
+ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  struct bootstrapState* state;
   NCCLCHECK(ncclCalloc(&state, 1));
   state->rank = rank;
   state->nranks = nranks;
-  *commState = state;
+  state->abortFlag = comm->abortFlag;
+  comm->bootstrap = state;
 
   TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
 
   struct extInfo info = { 0 };
   info.rank = rank;
   info.nranks = nranks;
-  int tmpSendFd, tmpRecvFd;
+  struct ncclSocket sock, listenSockRoot;
+  sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag;
+  sock.asyncFlag = listenSockRoot.asyncFlag = 0;
 
-  int extListenFdRoot;
-  memcpy(&info.extAddressListen,     &bootstrapNetIfAddr, sizeof(union socketAddress));
-  memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
-  NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
-  NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
+  // Create socket for other ranks to contact me
+  memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketListen(&state->listenSock));
+  memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));
+
+  // Create socket for root to contact me
+  memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketListen(&listenSockRoot));
+  memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
 
   // stagger connection times to avoid an overload of the root
   if (nranks > 128) {
@@ -386,38 +250,36 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
   }
 
   // send info on my listening socket to root
-  union socketAddress* rootAddr = (union socketAddress*)id;
-  NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
-  NCCLCHECK(bootstrapNetSend(tmpSendFd, rootAddr,  &info, sizeof(info)));
-  close(tmpSendFd);
+  memcpy(&sock.addr, id, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketConnect(&sock));
+  NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
+  close(sock.fd);
 
   // get info on my "next" rank in the bootstrap ring from root
-  union socketAddress addr;
-  NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd, &addr));
-  NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &state->extRingSendAddr, sizeof(state->extRingSendAddr)));
-  { // [RCCL] Receive PID from root
-    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, rootPid, sizeof(int)));
-  } // [/RCCL]
-  close(tmpRecvFd);
-  close(extListenFdRoot);
+  NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
+  NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress)));
+  close(sock.fd);
+  close(listenSockRoot.fd);
 
-  NCCLCHECK(connectAddress(&state->extRingSendFd, &state->extRingSendAddr));
+  NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
   // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd, &state->extRingRecvAddr));
+  NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));
 
   // AllGather all listen handlers
   NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
-  memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
-  NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
+  memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));
 
-  // Create the memory allocation service
-  NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
-  memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
-  NCCLCHECK(ncclCalloc(&state->allocState, 1));
-  CUDACHECK(hipGetDevice(&state->allocState->cudaDev));
-  NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
-  pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
-  NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
+  // Create the service proxy
+  NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
+  struct ncclSocket* proxySocket;
+  NCCLCHECK(ncclCalloc(&proxySocket, 1));
+  proxySocket->abortFlag = NULL; // proxy is aborted through a message
+  memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketListen(proxySocket));
+  memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
+  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
 
   TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
 
@@ -425,7 +287,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
 }
 
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
-  struct extState* state = (struct extState*)commState;
+  struct bootstrapState* state = (struct bootstrapState*)commState;
   char* data = (char*)allData;
   int rank = state->rank;
   int nranks = state->nranks;
@@ -441,9 +303,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
     size_t sslice = (rank - i + nranks) % nranks;
 
     // Send slice to the right
-    NCCLCHECK(bootstrapNetSend(state->extRingSendFd, &state->extRingSendAddr, data+sslice*size, size));
+    NCCLCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size));
     // Recv slice from the left
-    NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, &state->extRingRecvAddr, data+rslice*size, size));
+    NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size));
   }
 
   TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
@@ -451,14 +313,15 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
 }
 
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
-  struct extState* state = (struct extState*)commState;
-  int tmpSendFd;
-  union socketAddress *addr = state->peerCommAddresses+peer;
-  NCCLCHECK(connectAddress(&tmpSendFd, addr));
-  NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &state->rank, sizeof(int)));
-  NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &tag, sizeof(int)));
-  NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, data, size));
-  close(tmpSendFd);
+  struct bootstrapState* state = (struct bootstrapState*)commState;
+  struct ncclSocket sock;
+  sock.abortFlag = state->abortFlag; sock.asyncFlag = 0; // set asyncFlag explicitly (as bootstrapInit does); a stack ncclSocket is otherwise uninitialized
+  memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketConnect(&sock));
+  NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
+  NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
+  NCCLCHECK(bootstrapNetSend(&sock, data, size));
+  close(sock.fd);
   return ncclSuccess;
 }
 
@@ -499,14 +362,13 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
   return ncclSuccess;
 }
 
-ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd, union socketAddress *addr) {
+ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
   // New unex
   struct unexConn* unex;
   NCCLCHECK(ncclCalloc(&unex, 1));
   unex->peer = peer;
   unex->tag = tag;
-  unex->fd = fd;
-  unex->addr = *addr;
+  memcpy(&unex->sock, sock, sizeof(struct ncclSocket));
 
   // Enqueue
   struct unexConn* list = state->unexpectedConnections;
@@ -519,7 +381,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd
   return ncclSuccess;
 }
 
-int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAddress *addr) {
+ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
   struct unexConn* elem = state->unexpectedConnections;
   struct unexConn* prev = NULL;
   while (elem) {
@@ -529,81 +391,72 @@ int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAdd
       } else {
         prev->next = elem->next;
       }
-      int fd = elem->fd;
-      *addr = elem->addr;
+      memcpy(sock, &elem->sock, sizeof(struct ncclSocket));
       free(elem);
-      return fd;
+      return ncclSuccess;
     }
     prev = elem;
     elem = elem->next;
   }
-  return -1;
+  sock->fd = -1;
+  return ncclSuccess;
 }
 
 // We can't know who we'll receive from, so we need to receive everything at once
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
-  struct extState* state = (struct extState*)commState;
+  struct bootstrapState* state = (struct bootstrapState*)commState;
 
-  int tmpRecvFd;
-  union socketAddress addr;
+  struct ncclSocket sock;
+  sock.abortFlag = state->abortFlag; sock.asyncFlag = 0; // set asyncFlag explicitly; this struct is also copied into the unexpected-connection queue
 
   // Search unexpected connections first
-  if ((tmpRecvFd = unexpectedDequeue(state, peer, tag, &addr)) != -1) {
-    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size));
-    close(tmpRecvFd);
+  NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
+  if (sock.fd != -1) {
+    NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
+    close(sock.fd);
     return ncclSuccess;
   }
 
   // Then look for new connections
   while (1) {
-    union socketAddress addr;
-    NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd, &addr));
+    NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
     int newPeer, newTag;
-    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newPeer, sizeof(int)));
-    NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newTag, sizeof(int)));
+    NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int)));
+    NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int)));
     if (newPeer == peer && newTag == tag) {
-      NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size));
-      close(tmpRecvFd);
+      NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
+      close(sock.fd);
       return ncclSuccess;
     }
     // Unexpected connection. Save for later.
-    NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, tmpRecvFd, &addr));
+    NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock));
   }
 }
 
 ncclResult_t bootstrapClose(void* commState) {
-  struct extState* state = (struct extState*)commState;
+  struct bootstrapState* state = (struct bootstrapState*)commState;
   if (state->unexpectedConnections != NULL) {
     WARN("Unexpected connections are not empty");
     return ncclInternalError;
   }
-  close(state->extListenFd);
-  close(state->extRingSendFd);
-  close(state->extRingRecvFd);
-
-  state->allocState->stop = 1;
-
-  // Join the allocThread so we catch resource leaks as being hung here
-  // [RCCL] Uncommenting this join to clean up the allocThread
-  pthread_join(state->allocThread, nullptr);
-  // [/RCCL]
+  close(state->listenSock.fd);
+  close(state->ringSendSocket.fd);
+  close(state->ringRecvSocket.fd);
 
   free(state->peerCommAddresses);
-  free(state->peerAllocAddresses);
   free(state);
 
   return ncclSuccess;
 }
 
 ncclResult_t bootstrapAbort(void* commState) {
-  struct extState* state = (struct extState*)commState;
+  struct bootstrapState* state = (struct bootstrapState*)commState;
   if (commState == NULL) return ncclSuccess;
-  if (state->extListenFd) close(state->extListenFd);
-  if (state->extRingSendFd) close(state->extRingSendFd);
-  if (state->extRingRecvFd) close(state->extRingRecvFd);
-  if (state->allocState) state->allocState->stop = 2;
+  if (state->listenSock.fd) close(state->listenSock.fd);
+  if (state->ringSendSocket.fd) close(state->ringSendSocket.fd);
+  if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd);
   free(state->peerCommAddresses);
-  free(state->peerAllocAddresses);
+  free(state->peerProxyAddresses);
   free(state);
   return ncclSuccess;
 }
diff --git a/src/channel.cc b/src/channel.cc
index 5fa25c7bce..e9cfa6664f 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -65,13 +65,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
   for (int r=0; r<nRanks+1; r++) {
     struct ncclPeer* peer = channel->peers+r;
     for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      if (peer->send[b].transportResources) NCCLCHECK(peer->send[b].transportComm->free(peer->send[b].transportResources));
+      if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
     }
   }
   for (int r=0; r<nRanks+1; r++) {
     struct ncclPeer* peer = channel->peers+r;
     for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      if (peer->recv[b].transportResources) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv[b].transportResources));
+      if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
     }
   }
 
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 7a1e3e1da7..86ed853632 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -13,9 +13,9 @@ namespace {
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     ncclRing *ring = &ncclShmem->channel.ring;
     const int *ringRanks = ring->devUserRanks;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
@@ -23,12 +23,12 @@ namespace {
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
     const int nranks = ncclShmem->comm.nRanks;
     const ssize_t loopSize = nChannels*int(chunkSize);
-    const ssize_t size = args->coll.count;
+    const ssize_t size = args->count;
 
     T *inputBuf = (T*)args->sendbuff;
     T *outputBuf = (T*)args->recvbuff;
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
-      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg, args->coll.connIndex << 16);
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16);
 
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       ssize_t realChunkSize;
@@ -37,7 +37,7 @@ namespace {
         realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
       }
       else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
       else if (Proto::Id == NCCL_PROTO_LL128)
         realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
       realChunkSize = int(realChunkSize);
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index a22caaa989..e7c3c28cfb 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,6 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
-*  Modifications Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+ *
  * See LICENSE.txt for license information
  ************************************************************************/
 
@@ -8,7 +8,4 @@
 #include "common.h"
 #include "collectives.h"
 
-// [RCCL]
-// IMPL_COLL_R(AllReduce);
-IMPL_COLL_CLIQUE(AllReduce);
-// [/RCCL]
+IMPL_COLL_R(AllReduce);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 6398388ed9..c92ce89d33 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,26 +8,26 @@
 #include "devcomm.h"
 #include "collectives.h"
 #include "primitives.h"
-#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
+//#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     ncclRing *ring = &ncclShmem->channel.ring;
     int ringIx = ring->index;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
     const int nranks = ncclShmem->comm.nRanks;
     const ssize_t loopSize = nChannels*nranks*chunkSize;
-    const ssize_t size = args->coll.count;
 #ifdef ENABLE_PROFILING
     auto devProf = ncclShmem->comm.devProf;
     uint64_t clk, t0 = 0ULL, ws;
     if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
 #endif
+    const ssize_t size = args->count;
 
     int minChunkSize;
     if (Proto::Id == NCCL_PROTO_LL)
@@ -37,8 +37,8 @@ namespace {
       minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
     }
 
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto> prims
-      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg, args->coll.connIndex << 16);
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
 
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       ssize_t realChunkSize;
@@ -110,32 +110,35 @@ namespace {
       ACCUMULATE_COUNTER(directRecv);
     }
 #ifdef ENABLE_PROFILING
-    if (tid == 0 && args->coll.opCount) devProf->elems[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
+    if (tid == 0) {
+      struct ncclProfElem *elem = devProf.elems+args->opCount;
+      elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
+    }
 #endif
   }
 
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     ncclTree *tree = &ncclShmem->channel.tree;
     ssize_t chunkSize = int(
-      Proto::Id == NCCL_PROTO_SIMPLE ? args->coll.lastChunkSize
+      Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
                    /* LL & LL128 */  : Proto::calcBytePerStep()/sizeof(T));
     const ssize_t minChunkSize = int(
       Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T))
                    /* LL & LL128 */  : nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
     const ssize_t loopSize = int(nChannels*chunkSize);
-    const ssize_t size = args->coll.count;
+    const ssize_t size = args->count;
 
     if (loopSize > size)
       chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
 
     { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto> prims
-        (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
+        (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
       if (tree->up == -1) {
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -160,8 +163,8 @@ namespace {
     }
 
     { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-      Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto> prims
-        (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+      Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0> prims
+        (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
       if (tree->up == -1) {
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -189,19 +192,19 @@ namespace {
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     ncclTree *tree = &ncclShmem->channel.tree;
     ssize_t chunkSize = int(
-      Proto::Id != NCCL_PROTO_LL ? args->coll.lastChunkSize
+      Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
                                  : Proto::calcBytePerStep()/sizeof(T));
     const ssize_t minChunkSize = int(
       Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T)) :
       Proto::Id == NCCL_PROTO_LL     ? nthreads*(Proto::calcBytePerGrain()/sizeof(T))
                    /* LL128 */       : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8);
     const ssize_t loopSize = int(nChannels*chunkSize);
-    const ssize_t size = args->coll.count;
+    const ssize_t size = args->count;
 
     int nthreadsSplit;
     if (Proto::Id == NCCL_PROTO_SIMPLE) {
@@ -218,8 +221,8 @@ namespace {
 
     if (tree->up == -1) {
       // Reduce and broadcast. Max number of recv is 3, max number of send is 3
-      Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto>
-        prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+      Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*int(chunkSize);
         int nelem = min(chunkSize, size-offset);
@@ -235,8 +238,8 @@ namespace {
        * into DirectRecv and DirectSend capabilities, this ctor would have both=0,
        * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
        */
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto>
-        prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, 0*Proto::MaxGroupWidth);
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0>
+        prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
       if (tree->down[0] == -1) {
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -254,8 +257,8 @@ namespace {
     }
     else {
       // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
-      Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto>
-        prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, 1*Proto::MaxGroupWidth);
+      Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth);
       if (tree->down[0] == -1) {
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -294,11 +297,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
   __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
     static constexpr int COLLNET_COPY_THREADS = 64;
     const int tid = threadIdx.x;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     struct ncclDirect* tree = &ncclShmem->channel.collTree;
-    const ssize_t chunkSize = int(args->coll.lastChunkSize);
-    const ssize_t size = args->coll.count;
+    const ssize_t chunkSize = int(args->lastChunkSize);
+    const ssize_t size = args->count;
     const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
 
     const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
@@ -306,7 +309,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
     const int nThreadsScatter = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
     const int nThreadsGather  = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
     const int nThreadsBcast   = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS);
-    const int nThreadsReduce = args->nThreads - nThreadsScatter - nThreadsGather - nThreadsBcast;
+    const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
     const int tidStartBcast = nThreadsGather;
     const int tidStartScatter = tidStartBcast + nThreadsBcast;
     const int tidStartReduce = tidStartScatter + nThreadsScatter;
@@ -316,8 +319,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
     if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
       // Scatter
       int group = (2*Proto::MaxGroupWidth) | (1<<16);
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto>
-        prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
         int nelem = min(tree->nHeads*chunkSize, size-offset);
@@ -331,8 +334,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
       int group = (3*Proto::MaxGroupWidth) | (1<<16);
       if (hasDn) {
         // Reduce, send to network
-        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto>
-          prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
@@ -344,8 +347,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
         }
       } else {
         // Directly send to network
-        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto>
-          prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group);
+        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
@@ -355,8 +358,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
     } else if (tid < tidStartBcast && hasUp) {
       // Gather
       int group = (0*Proto::MaxGroupWidth) | (0<<16);
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto>
-        prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
+        prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
         int nelem = min(tree->nHeads*chunkSize, size-offset);
@@ -366,8 +369,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
       int group = (1*Proto::MaxGroupWidth) | (0<<16);
       if (hasDn) {
         // Recv from network, broadcast
-        Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto>
-          prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+        Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
@@ -375,8 +378,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
         }
       } else {
         // Recv from network (no post thread needed)
-        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto>
-          prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->coll.redOpArg, group);
+        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
@@ -404,13 +407,15 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
   __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
-    LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
+    runRing<T, RedOp, ProtoLL128>(args);  // ring AllReduce instantiated with the LL128 protocol
+    // Clique kernel path disabled; the previous body was: LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
   }
 };
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
   __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
-    LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
+    runTreeSplit<T, RedOp, ProtoLL128>(args);  // split-tree AllReduce instantiated with the LL128 protocol
+    // Clique kernel path disabled; the previous body was: LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
   }
 };
diff --git a/src/collectives/device/alltoall_pivot.h b/src/collectives/device/alltoall_pivot.h
index a6740cd3e5..4b00ddbca8 100644
--- a/src/collectives/device/alltoall_pivot.h
+++ b/src/collectives/device/alltoall_pivot.h
@@ -12,24 +12,24 @@ namespace {
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
     const int nranks = ncclShmem->comm.nRanks;
     const ncclRing *ring = &ncclShmem->channel.ring;
-    const int num_bi_rings = args->coll.pivotA2ANumBiRings;
+    const int num_bi_rings = args->pivotA2ANumBiRings;
     const int num_uni_rings = num_bi_rings * 2;
-    const int num_chunks = args->coll.nChannels / 2;
+    const int num_chunks = args->nChannels / 2;
     const int chunk_id = (bid % num_bi_rings) + (bid / num_uni_rings * num_bi_rings);
-    const int elem_size = args->coll.count % 256 ? 1 : 256;
-    const ssize_t num_elems = args->coll.count / elem_size;
+    const int elem_size = args->count % 256 ? 1 : 256;
+    const ssize_t num_elems = args->count / elem_size;
     const int num_padding_chunks = num_elems % num_chunks;
     const ssize_t chunk_offset = elem_size * (num_elems / num_chunks * chunk_id + (chunk_id < num_padding_chunks ? chunk_id : num_padding_chunks));
     const ssize_t chunk_size = elem_size * (num_elems / num_chunks + (chunk_id < num_padding_chunks ? 1 : 0));
     const int pivot_direction = (bid % num_uni_rings) / num_bi_rings;
     const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLTOALL_PIVOT_CHUNKSTEPS : 1));
 
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto> prims
-      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0, args->coll.connIndex << 16);
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0, args->connIndex << 16);
 
     for (int num_hops = 0; num_hops <= nranks / 2; num_hops++) {
       const int src_rank = ring->devUserRanks[(nranks - num_hops) % nranks];
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index be01d90836..a97836c672 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,27 +12,27 @@ namespace {
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     ncclRing *ring = &ncclShmem->channel.ring;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
     const ssize_t loopSize = nChannels*chunkSize;
-    const ssize_t size = args->coll.count;
+    const ssize_t size = args->count;
     const int rank = ring->devUserRanks[0];
     const int nextRank = ring->devUserRanks[1];
-    const int root = args->coll.root;
 #ifdef ENABLE_PROFILING
     auto devProf = ncclShmem->comm.devProf;
     uint64_t clk, t0 = 0ULL, ws;
     if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
 #endif
+    const int root = args->root;
 
     T *inputBuf = (T*)args->sendbuff;
     T *outputBuf = (T*)args->recvbuff;
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
-      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg, args->coll.connIndex << 16);
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16);
 
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       ssize_t realChunkSize;
@@ -41,7 +41,7 @@ namespace {
         realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
       }
       else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
       else if (Proto::Id == NCCL_PROTO_LL128)
         realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
       realChunkSize = int(realChunkSize);
@@ -70,7 +70,10 @@ namespace {
       }
     }
 #ifdef ENABLE_PROFILING
-    if (tid == 0 && args->coll.opCount) devProf->elems[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
+    if (tid == 0) {
+      struct ncclProfElem *elem = devProf.elems+args->opCount;
+      elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
+    }
 #endif
   }
 }
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index b484e0e3c7..084612f9a2 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,6 +10,7 @@
 
 #include "collectives.h"
 #include "devcomm.h"
+#include "op128.h"
 
 #define COLL_UNROLL 2
 #define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1)  // Using balanced tree instead of split tree
@@ -67,40 +68,8 @@
   NCCL_FUNCS3B(func, Sum), \
   NCCL_FUNCS3B(func, Sum)
 
-// [RCCL] Adding clique-based kernels for AllReduce, in-place of unused RingLL28 kernels
-#define NCCL_FUNC5B(func, algo, devredop, type, nullify) \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128,  devredop, type)), \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
-
-#define NCCL_FUNC4B(func, devredop, type, nullify) \
-  NCCL_FUNC5B(func, TREE,    devredop, type, nullify), \
-  NCCL_FUNC5B(func, RING,    devredop, type, nullify), \
-  NCCL_FUNC5B(func, COLLNET, devredop, type, nullify)
-
-#define NCCL_FUNCS3C(func, devredop, nullForFloat) \
-  NCCL_FUNC4B(func, devredop, int8_t, 0), \
-  NCCL_FUNC4B(func, devredop, uint8_t, 0), \
-  NCCL_FUNC4B(func, devredop, int32_t, 0), \
-  NCCL_FUNC4B(func, devredop, uint32_t, 0), \
-  NCCL_FUNC4B(func, devredop, int64_t, 0), \
-  NCCL_FUNC4B(func, devredop, uint64_t, 0), \
-  NCCL_FUNC4B(func, devredop, half, nullForFloat), \
-  NCCL_FUNC4B(func, devredop, float, nullForFloat), \
-  NCCL_FUNC4B(func, devredop, double, nullForFloat), \
-  NCCL_FUNC4B(func, devredop, rccl_bfloat16, nullForFloat)
-
-#define NCCL_FUNCS2C(func) \
-  NCCL_FUNCS3C(func, Sum,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3C(func, Prod,       /*nullForFloat=*/0), \
-  NCCL_FUNCS3C(func, Max,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3C(func, Min,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3C(func, PreMulSum,  /*nullForFloat=*/0), \
-  NCCL_FUNCS3C(func, SumPostDiv, /*nullForFloat=*/1)
-
-
 // Must be consistent with the ncclFuncSet enum
-using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);
+using ncclKernelFunc_t = void (*)();
 
 static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
 // Don't try to initialize the host shadow copy of this device-side global
@@ -108,13 +77,13 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
 // confuses clang. This will be fixed in the next clang release.
 #if defined(__HIP_DEVICE_COMPILE__)
 #if defined(BUILD_ALLREDUCE_ONLY)
-  NCCL_FUNC4B(AllReduce, Sum, float, 0),
+  NCCL_FUNC4(AllReduce, Sum, float, 0),
 #else
   NCCL_FUNCS2B(Broadcast),
   NCCL_FUNCS2A(Reduce),
   NCCL_FUNCS2B(AllGather),
   NCCL_FUNCS2A(ReduceScatter),
-  NCCL_FUNCS2C(AllReduce),
+  NCCL_FUNCS2A(AllReduce),
   NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
   NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
   NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
@@ -136,18 +105,18 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
 template<unsigned short f, unsigned short l>
 struct Caller {
   static __device__ __host__
-  void call(struct ncclWorkElem* const c) noexcept
+  void call(unsigned short funcIndex) noexcept  // dispatches funcIndex within the half-open index range [f, l)
   {
     constexpr unsigned short m = f + (l - f) / 2;
 
-     return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
+     return (funcIndex < m) ? Caller<f, m>::call(funcIndex) : Caller<m, l>::call(funcIndex);  // recurse into the half split at midpoint m
   }
 };
 
 template<unsigned short f>
 struct Caller<f, f + 1>{
   static __device__ __host__
-  void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); }
+  void call(unsigned short funcIndex) noexcept { ncclFuncs[f](); }  // base case: one entry left; funcIndex is unused, compile-time f selects the table slot
 };
 
 static_assert(FUNC_INDEX_P2P == 2710, "Wrong P2P function index");
@@ -155,86 +124,86 @@ static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 2711, "Wrong AllToAllPivot function i
 
 inline
 __device__
-void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
+void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
 #if defined(BUILD_ALLREDUCE_ONLY)
-  if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE))
-    ncclFunction_AllReduce_RING_SIMPLE_Sum_float(c);
-  else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL))
-    ncclFunction_AllReduce_RING_LL_Sum_float(c);
-  else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL128))
-    ncclFunction_AllReduce_RING_LL128_Sum_float(c);
-  else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE))
-    ncclFunction_AllReduce_TREE_SIMPLE_Sum_float(c);
-  else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL))
-    ncclFunction_AllReduce_TREE_LL_Sum_float(c);
-  else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE))
-    ncclFunction_AllReduce_COLLNET_SIMPLE_Sum_float(c);
-  else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_LL))
-    ncclFunction_AllReduce_COLLNET_LL_Sum_float(c);
+  if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE))
+    ncclFunction_AllReduce_RING_SIMPLE_Sum_float();
+  else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL))
+    ncclFunction_AllReduce_RING_LL_Sum_float();
+  else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL128))
+    ncclFunction_AllReduce_RING_LL_Sum_float();
+  else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE))
+    ncclFunction_AllReduce_TREE_SIMPLE_Sum_float();
+  else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL))
+    ncclFunction_AllReduce_TREE_LL_Sum_float();
+  else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE))
+    ncclFunction_AllReduce_COLLNET_SIMPLE_Sum_float();
+  else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_LL))
+    ncclFunction_AllReduce_COLLNET_LL_Sum_float();
   else
     assert("Unsupported function index");
 #else
-  if (c->funcIndex < 540) {
-    if (c->funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
-    else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(c);
+  if (funcIndex < 540) {
+    if (funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t();
+    else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t();
   }
-  else if (c->funcIndex < 1080) Caller<540, 1080>::call(c);
-  else if (c->funcIndex < 1620) {
-    if (c->funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
-    else if (c->funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
-    else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
+  else if (funcIndex < 1080) Caller<540, 1080>::call(funcIndex);
+  else if (funcIndex < 1620) {
+    if (funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t();
+    else if (funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t();
+    else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t();
   }
-  else if (c->funcIndex < 2700) Caller<1620, 2700>::call(c);
+  else if (funcIndex < 2700) Caller<1620, 2700>::call(funcIndex);
   else {
-    switch (c->funcIndex - 2700) {
+    switch (funcIndex - 2700) {
       case 0:
-        ncclFunction_OneRankReduce_PreMulSum_int8_t(c);
+        ncclFunction_OneRankReduce_PreMulSum_int8_t();
         break;
       case 1:
-        ncclFunction_OneRankReduce_PreMulSum_uint8_t(c);
+        ncclFunction_OneRankReduce_PreMulSum_uint8_t();
         break;
       case 2:
-        ncclFunction_OneRankReduce_PreMulSum_int32_t(c);
+        ncclFunction_OneRankReduce_PreMulSum_int32_t();
         break;
       case 3:
-        ncclFunction_OneRankReduce_PreMulSum_uint32_t(c);
+        ncclFunction_OneRankReduce_PreMulSum_uint32_t();
         break;
       case 4:
-        ncclFunction_OneRankReduce_PreMulSum_int64_t(c);
+        ncclFunction_OneRankReduce_PreMulSum_int64_t();
         break;
       case 5:
-        ncclFunction_OneRankReduce_PreMulSum_uint64_t(c);
+        ncclFunction_OneRankReduce_PreMulSum_uint64_t();
         break;
       case 6:
-        ncclFunction_OneRankReduce_PreMulSum_half(c);
+        ncclFunction_OneRankReduce_PreMulSum_half();
         break;
       case 7:
-        ncclFunction_OneRankReduce_PreMulSum_float(c);
+        ncclFunction_OneRankReduce_PreMulSum_float();
         break;
       case 8:
-        ncclFunction_OneRankReduce_PreMulSum_double(c);
+        ncclFunction_OneRankReduce_PreMulSum_double();
         break;
       case 9:
-        ncclFunction_OneRankReduce_PreMulSum_rccl_bfloat16(c);
+        ncclFunction_OneRankReduce_PreMulSum_rccl_bfloat16();
         break;
       case 10:
-        ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
+        ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t();
         break;
       case 11:
-        ncclFunction_AllToAllPivot_RING_SIMPLE_Sum_int8_t(c);
+        ncclFunction_AllToAllPivot_RING_SIMPLE_Sum_int8_t();
       default:
         break;
     }
@@ -249,45 +218,49 @@ class ncclFunction {
 };
 
 #ifdef ENABLE_COLLTRACE
-#define traceColl(fIdx)  \
-    uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
-    shmem.comm.collTrace[pos].bid = bid; \
-    shmem.comm.collTrace[pos].funcIndex = fIdx; \
-    if (fIdx == FUNC_INDEX_P2P) { \
-      shmem.comm.collTrace[pos].opCount = elems[0].p2p.opCount; \
-      shmem.comm.collTrace[pos].p2p.nThreads = elems[0].p2p.nThreads; \
-      shmem.comm.collTrace[pos].p2p.delta = (uint16_t)(elems[0].p2p.delta); \
+#define traceColl(elem,launch_type) /* record one work element in the collTrace buffer; not brace-wrapped, so `pos` is declared in the scope where the macro expands */ \
+    uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; /* claim a slot; index wraps modulo COLLTRACE_NUM_ITEMS */ \
+    ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    ncclShmem->comm.collTrace[pos].bid = blockIdx.x; \
+    ncclShmem->comm.collTrace[pos].funcIndex = ncclShmem->work.header.funcIndex; \
+    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (ncclShmem->comm.collTrace[pos].data_0)); \
+    if (elem.header.type == ncclWorkTypeP2p) { /* P2P work: elem is reinterpreted as ncclWorkElemP2p and entries [0] and [1] are both recorded */ \
+      struct ncclWorkElemP2p *p2pElems = (struct ncclWorkElemP2p *)&elem; \
+      ncclShmem->comm.collTrace[pos].p2p[0].connIndex = p2pElems[0].connIndex; \
+      ncclShmem->comm.collTrace[pos].p2pOpCount[0] = p2pElems[0].opCount; \
+      ncclShmem->comm.collTrace[pos].p2p[0].ngroups = p2pElems[0].ngroups; \
+      ncclShmem->comm.collTrace[pos].p2p[0].nWarps = p2pElems[0].nWarps; \
+      ncclShmem->comm.collTrace[pos].p2p[0].warpStart = p2pElems[0].warpStart; \
+      ncclShmem->comm.collTrace[pos].p2p[0].peer = (uint16_t)(p2pElems[0].peer); \
+      ncclShmem->comm.collTrace[pos].p2p[1].connIndex = p2pElems[1].connIndex; \
+      ncclShmem->comm.collTrace[pos].p2pOpCount[1] = p2pElems[1].opCount; \
+      ncclShmem->comm.collTrace[pos].p2p[1].ngroups = p2pElems[1].ngroups; \
+      ncclShmem->comm.collTrace[pos].p2p[1].nWarps = p2pElems[1].nWarps; \
+      ncclShmem->comm.collTrace[pos].p2p[1].warpStart = p2pElems[1].warpStart; \
+      ncclShmem->comm.collTrace[pos].p2p[1].peer = (uint16_t)(p2pElems[1].peer); \
+      ncclShmem->comm.collTrace[pos].type = (ncclCollTraceP2pElemType|launch_type); /* element kind OR-ed with the caller-supplied launch_type bits */ \
     } else { \
-      shmem.comm.collTrace[pos].opCount = elems[0].coll.opCount; \
-      shmem.comm.collTrace[pos].coll.nThreads = elems[0].nThreads; \
-      shmem.comm.collTrace[pos].coll.bid = elems[0].coll.bid; \
-      shmem.comm.collTrace[pos].coll.nChannels = elems[0].coll.nChannels; \
+      ncclShmem->comm.collTrace[pos].opCount = elem.opCount; \
+      ncclShmem->comm.collTrace[pos].coll.nWarps = elem.header.nWarps; \
+      ncclShmem->comm.collTrace[pos].coll.bid = elem.bid; \
+      ncclShmem->comm.collTrace[pos].coll.nChannels = elem.nChannels; \
+      ncclShmem->comm.collTrace[pos].type = (ncclCollTraceCollElemType|launch_type); \
     }
-#define traceKernelLaunch(fIdx)  { \
-    if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \
-      traceColl(fIdx); \
-      asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (shmem.comm.collTrace[pos].data_0)); \
-      shmem.comm.collTrace[pos].type = ncclCollTraceKernelLaunchType; \
-    } \
+
+#define traceKernelLaunch(elem,firstLaunch)  { \
+    traceColl(elem,(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType)); \
   }
-#define traceCollEnd(fIdx)  { \
-    if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \
-      traceColl(fIdx); \
-      shmem.comm.collTrace[pos].type = ncclCollTraceCollEndType; \
-    } \
+#define traceKernelEnd()  { /* append a kernel-end record (timestamp, block id, type) to the collTrace buffer */ \
+    uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    ncclShmem->comm.collTrace[pos].bid = blockIdx.x; /* read blockIdx.x directly, as traceColl does, rather than a caller-local `bid` */ \
+    ncclShmem->comm.collTrace[pos].type = ncclCollTraceKernelEndType; \
   }
-#define traceKernelEnd(fIdx)  { \
-    if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \
-      traceColl(fIdx); \
-      shmem.comm.collTrace[pos].type = ncclCollTraceKernelEndType; \
-    } \
-  }
-#define traceAbort(fIdx)  { \
-    if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \
-      traceColl(fIdx); \
-      shmem.comm.collTrace[pos].type = ncclCollTraceAbortType; \
-    } \
+#define traceAbort()  { /* append an abort record (timestamp, block id, type) to the collTrace buffer */ \
+    uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    ncclShmem->comm.collTrace[pos].bid = blockIdx.x; /* read blockIdx.x directly, as traceColl does, rather than a caller-local `bid` */ \
+    ncclShmem->comm.collTrace[pos].type = ncclCollTraceAbortType; \
   }
 //  traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
 #define traceData(data2, data4, data8_0, data8_1) { \
@@ -301,9 +274,8 @@ class ncclFunction {
     ncclShmem->comm.collTrace[pos].type = ncclCollTraceDataType; \
   }
 #else
-#define traceKernelLaunch(fIdx)
-#define traceCollEnd(fIdx)
-#define traceAbort(fIdx)
+#define traceKernelLaunch(elem,firstLaunch) // no-op stub when ENABLE_COLLTRACE is off; takes the same two arguments as the enabled definition so call sites still preprocess
+#define traceAbort() // no-op stub when ENABLE_COLLTRACE is off
 #define traceData(data2, data4, data8_0, data8_1)
 #endif
 
@@ -313,6 +285,28 @@ __device__ inline bool barrierReduceAny(int bit, uint32_t* abortCount) {
   return atomicAdd(abortCount, 0) != 0;
 }
 
+// Copy *src into *dst, 16 bytes per thread (thread `tid` handles 64-bit words 2*tid and 2*tid+1); dst words past sizeof(Tsrc) are written as zero.
+template<typename Tdst, typename Tsrc>
+__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) { // nthreads is not referenced in the body
+  static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0,
+      "copyToShmem needs sizes which are multiple of 16B");
+  static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small");
+  static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle"); // NOTE(review): the bound is 16*WARP_SIZE bytes, which equals the message's 512B only when WARP_SIZE == 32 — confirm for 64-wide warps
+  uint64_t *d = reinterpret_cast<uint64_t*>(dst);
+  uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
+  uint64_t *shmemPtr = d; // alias of d: the destination viewed as 64-bit words
+  int offset = 2*tid; // index of this thread's first 64-bit word
+  uint64_t v0, v1;
+  if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) { // this thread's words lie past the end of src: contribute zeroes
+    v0 = v1 = 0ULL;
+  } else {
+    v0 = s[offset] ; v1 = s[offset+1];
+  }
+  if (offset < sizeof(Tdst)/sizeof(uint64_t)) { // only threads whose words fall inside dst perform a store
+    shmemPtr[offset] = v0; shmemPtr[offset+1] = v1;
+  }
+}
+
 template<typename T>
 __device__ int copyToShmem(T *dst, T const *src, int turn=0) {
   static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh");
@@ -352,47 +346,21 @@ struct RunWorkElement {
   }
 };
 
-#if CUDART_VERSION >= 11030
-__device__ constexpr int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] =
-#else
-static __device__ __constant__ int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] =
-#endif
-{/*Tree*/1, /*Ring and P2P*/1, /*CollNet*/NCCL_REG_ELEM_FACTOR};
-
 template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
 struct RunWork {
   // This __forceinline__ is necessary. The compiler was inserting a function call
   // here from the LL ncclKernel.
   __device__ __forceinline__ void run(ncclWork *w) {
-    int tid = threadIdx.x;
-    /* Some invariants that must hold:
-     * 1. All elems[] have same funcIndex.
-     * 2. All elems[] have same nThreads.
-     * 3. The thread-to-group relation (as in prims group numbers) is the same
-     *    for all elems[].
-     *
-     * If (1) isn't true then we might be in the wrong function since dispatch
-     * on ncclFuncs[w->funcIndex] is how we got here.
-     *
-     * If (2) or (3) aren't true, then threads from different work elements
-     * could race for barrier resources (barrier numbers 0...15) which is fatal.
-     *
-     * IMPORTANT!!! To ensure (3), implementations of
-     * `RunWorkElement<Fn,T,RedOp,Algo,Proto>::run()` may only use the following
-     * when deciding how to map threads to groups:
-     *    Fn, T, RedOp, Algo, Proto, nThreads
-     *
-     * This last one is difficult to enforce so I hope everyone reads this.
-     */
-    if (tid < w->elems[0].nThreads) {
-      #pragma unroll 1
-      for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e+=ncclWorkElemFactors[Algo])
+    int wid = threadIdx.x / WARP_SIZE; // warp index of this thread within the block
+    int inc = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1; // elems[] stride: size ratio of ncclWorkElemReg to ncclWorkElem for RegColl work, otherwise 1
+    #pragma unroll 1
+    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) { // stop at the first element marked unused
+      if (wid < w->header.nWarps) // warps at or beyond header.nWarps skip the element but still advance the loop
         RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]);
     }
   }
 };
 
-#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
 struct ncclShmemGroup {
   ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
   ncclConnInfo *sendConns[NCCL_MAX_DIRECT_ARITY];
@@ -400,7 +368,7 @@ struct ncclShmemGroup {
   void* dsts[NCCL_MAX_DIRECT_ARITY+1];
   int totalSendSize[NCCL_MAX_SLICE_PER_CHUNK];
   uint64_t barrier;
-  uint64_t barrier_next[MAXWARPS];
+  uint64_t barrier_next[NCCL_MAX_GROUPS];
 };
 
 struct ncclShmemData {
@@ -408,18 +376,41 @@ struct ncclShmemData {
     uint64_t ll128warp[NCCL_MAX_GROUPS][NCCL_MAX_GROUPS];
     struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
   };
-  uint32_t sync[MAXWARPS];
+  uint32_t sync[NCCL_MAX_GROUPS];
   uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
-  ncclDevComm comm;
-  ncclChannel channel;
-  ncclWork work;
+  struct ncclDevComm comm;
+  struct ncclChannel channel;
+  uint64_t pad;
+  struct ncclWork work;
 };
 
+static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { // in place: replaces a pointer-valued we->redOpArg with the value it points to
+  if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) { // unused elements and non-pointer arguments are left untouched
+    /* redOpArg is a pointer to the scalar value, so we'll dereference it
+     * here so that redOpArg holds the bits of the scalar going forward.
+     * The tricky thing is we don't know its type T since that's encoded in
+     * the funcIndex. Because it would be difficult to get sizeof(T) from
+     * funcIndex, we'll cheat and just dereference the largest possible size
+     * given the alignment of the pointer. We might be reading in more bytes
+     * than we need but that's harmless.
+     */
+    if (we->redOpArg%2 != 0) // odd address: only a 1-byte load is aligned
+      we->redOpArg = *reinterpret_cast<uint8_t*>(we->redOpArg);
+    else if (we->redOpArg%4 != 0) // 2-byte aligned only
+      we->redOpArg = *reinterpret_cast<uint16_t*>(we->redOpArg);
+    else if (we->redOpArg%8 != 0) // 4-byte aligned only
+      we->redOpArg = *reinterpret_cast<uint32_t*>(we->redOpArg);
+    else // 8-byte aligned
+      we->redOpArg = *reinterpret_cast<uint64_t*>(we->redOpArg);
+  }
+}
+
 extern __device__ struct ncclShmemData *ncclShmem;
 
 template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE>
-__device__ void ncclKernel(ncclWorkElem first)  {
+__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first)  {
   int tid = threadIdx.x;
+  int nthreads = blockDim.x;
   int bid = blockIdx.x;
   __shared__ struct ncclShmemData shmem;
   ncclShmem = &shmem;
@@ -428,110 +419,90 @@ __device__ void ncclKernel(ncclWorkElem first)  {
     abortCount = 0;
     for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
       shmem.groups[i].barrier = 0;
-      for (auto j = 0; j < MAXWARPS; j++) shmem.groups[i].barrier_next[j] = 0;
+      for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
     }
   }
   __syncthreads();
 
-  int turn = copyToShmem(&shmem.comm, first.comm);
+  int turn = copyToShmem(&ncclShmem->comm, comm);
   // get address of channel without incurring indirect load from ncclDevCom::channels
-  ncclChannel *channel = &((ncclDevCommAndChannels*)first.comm)->channels[bid];
-  turn = copyToShmem(&shmem.channel, channel, turn);
+  ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
+  turn = copyToShmem(&ncclShmem->channel, channel, turn);
 
   // To optimize for latency, (only) the first operation is passed as argument.
-  if (bid == 0 && first.active != 0) {
-    turn = copyToShmem(&shmem.work.elems[0], &first, turn);
-    if (1 <= tid && tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) {
-      shmem.work.elems[tid].active = 0;
-      shmem.work.elems[tid].redOpArgIsPtr = 0;
-    }
+  if (bid == 0 && first.header.type != ncclWorkTypeUnused) {
+    // Copy first elem to work and zero out the rest
+    copyToShmem(&ncclShmem->work, &first, tid, nthreads);
   }
-  struct ncclWorkElem* elems = shmem.work.elems;
-  __syncthreads(); // publish shmem
+  __syncthreads(); // publish ncclShmem
 
-  ncclWork *workFifoHost = shmem.channel.workFifo;
-  ncclWork *workFifoDev = shmem.channel.workFifoDev;
-  int workFifoIx = shmem.channel.index;
+  ncclWork *workFifoHost = ncclShmem->channel.workFifo;
+  ncclWork *workFifoDev = ncclShmem->channel.workFifoDev;
+  int workFifoIx = ncclShmem->channel.index;
 
   bool skipLoadWork = false, firstLaunch = true;
-  if (bid == 0 && first.active != 0)
+  if (bid == 0 && first.header.type != ncclWorkTypeUnused)
     skipLoadWork = true;
 
   while (true) {
     if (!skipLoadWork) {
-      copyToShmem(&shmem.work, &workFifoDev[workFifoIx]); // turn no longer helps
-      // Check whether the last operation was aborted and make sure all threads exit
-      int aborted = tid == 0 ? *shmem.comm.abortFlag : 0;
-      if (barrierReduceAny(aborted, &abortCount)) { // publish shmem.work
-        if (COLLTRACE && tid == 0) traceAbort(elems->funcIndex);
-        break;
+      copyToShmem(&ncclShmem->work, &workFifoDev[workFifoIx], tid, nthreads);
+      { // Check whether the last operation was aborted and make sure all threads exit
+        int aborted = tid == 0 ? *comm->abortFlag : 0;
+        if (barrierReduceAny(aborted, &abortCount)) { // publish ncclShmem->work
+          if (COLLTRACE && tid == 0) traceAbort();
+          break;
+        }
+        if (tid == 0)
+          workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
       }
-      if (tid == 0)
-        workFifoHost[workFifoIx].elems[0].active = 0;
-      if (COLLTRACE && tid == 0) {
-        if (firstLaunch) traceKernelLaunch(elems->funcIndex);
-        if (!firstLaunch) traceCollEnd(elems->funcIndex);
-        firstLaunch = false;
-      }
-    } else if (COLLTRACE && tid == 0) {
-        traceKernelLaunch(elems->funcIndex);
-        firstLaunch = false;
     }
 
     workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
     if (tid == 0)
       channel->index = workFifoIx; // write back to real channel, not shmem shadow
 
-    if (tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) {
-      ncclWorkElem *we = &shmem.work.elems[tid];
-      if (we->redOpArgIsPtr && we->active != 0) {
-        /* redOpArg is a pointer to the scalar value, so we'll dereference it
-         * here so that redOpArg holds the bits of the scalar going forward.
-         * The tricky thing is we don't know its type T since that's encoded in
-         * the funcIndex. Because it would be difficult to get sizeof(T) from
-         * funcIndex, we'll cheat and just dereference the largest possible size
-         * given the alignment of the pointer. We might be reading in more bytes
-         * than we need but that's harmless.
-         */
-        if (we->coll.redOpArg%2 != 0)
-          we->coll.redOpArg = *reinterpret_cast<uint8_t*>(we->coll.redOpArg);
-        else if (we->coll.redOpArg%4 != 0)
-          we->coll.redOpArg = *reinterpret_cast<uint16_t*>(we->coll.redOpArg);
-        else if (we->coll.redOpArg%8 != 0)
-          we->coll.redOpArg = *reinterpret_cast<uint32_t*>(we->coll.redOpArg);
-        else
-          we->coll.redOpArg = *reinterpret_cast<uint64_t*>(we->coll.redOpArg);
-      }
+    __syncwarp();
+    if (ncclShmem->work.header.type == ncclWorkTypeColl) {
+      if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem->work.elems[tid]);
+    } else if (ncclShmem->work.header.type == ncclWorkTypeRegColl) {
+      if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem->work.regElems[tid].elem);
     }
     __syncthreads();
 
-    if (shmem.work.elems[0].funcIndex == FnIndex)
-      RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
-    else
-      NCCL_CALL_FUNCTIONS(&elems[0]);
-
-    if (shmem.work.elems[0].active == 2) {
-      if (COLLTRACE && tid == 0) traceKernelEnd(elems->funcIndex)
-      break;
+    if (COLLTRACE && tid == 0) {
+      traceKernelLaunch(ncclShmem->work.elems[0],firstLaunch);
+      firstLaunch = false;
+      #pragma unroll 1
+      for(int e=1; e < NCCL_MAX_WORK_ELEMENTS && ncclShmem->work.elems[e].header.type != ncclWorkTypeUnused; e ++) {
+        traceColl(ncclShmem->work.elems[e], 0);
+      }
     }
+    if (ncclShmem->work.header.funcIndex == FnIndex)
+      RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem->work);
+    else
+      NCCL_CALL_FUNCTIONS(ncclShmem->work.header.funcIndex);
+
+    if (ncclShmem->work.header.isLast) break;
     __syncthreads();
     skipLoadWork = false;
   }
+  if (COLLTRACE && tid == 0) traceKernelEnd()
 }
 
 #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) \
-__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem first) { \
-  if (first.comm->collTraceThread) \
-    ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(first); \
+__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, ncclWorkElem first) { \
+  if (comm->collTraceThread) \
+    ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(comm, first); \
   else \
-    ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(first); \
+    ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(comm, first); \
 }
 
 // Examples :     AllReduce, RING, LL,    Sum,   uint8
 /* Functions for aggregation case */
 #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
-__device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(struct ncclWorkElem* args) { \
+__device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
   RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem->work); \
 }
 
@@ -574,46 +545,6 @@ __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
   IMPL_COLL2(func, PreMulSum) \
   IMPL_COLL2A(func, SumPostDiv)
 
-// [RCCL] Define clique-based implementations (repurposed LL128)
-#define IMPL_COLL4_CLIQUE(func, algo, devredop, type, ncclType) \
-  IMPL_COLL_FUNC(func, algo, LL,     devredop, type) \
-  IMPL_COLL_FUNC(func, algo, LL128,  devredop, type) \
-  IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \
-
-#define IMPL_COLL3_CLIQUE(func, devredop, type, ncclType) \
-  IMPL_COLL4_CLIQUE(func, TREE,    devredop, type, ncclType) \
-  IMPL_COLL4_CLIQUE(func, RING,    devredop, type, ncclType) \
-  IMPL_COLL4_CLIQUE(func, COLLNET, devredop, type, ncclType)
-
-#define IMPL_COLL2_CLIQUE(func, devredop) \
-  IMPL_COLL3_CLIQUE(func, devredop, int8_t,   ncclInt8) \
-  IMPL_COLL3_CLIQUE(func, devredop, uint8_t,  ncclUint8) \
-  IMPL_COLL3_CLIQUE(func, devredop, int32_t,  ncclInt32) \
-  IMPL_COLL3_CLIQUE(func, devredop, uint32_t, ncclUint32) \
-  IMPL_COLL3_CLIQUE(func, devredop, int64_t,  ncclInt64) \
-  IMPL_COLL3_CLIQUE(func, devredop, uint64_t, ncclUint64) \
-  IMPL_COLL3_CLIQUE(func, devredop, half,     ncclFloat16) \
-  IMPL_COLL3_CLIQUE(func, devredop, float,    ncclFloat32) \
-  IMPL_COLL3_CLIQUE(func, devredop, double,   ncclFloat64) \
-  IMPL_COLL3_CLIQUE(func, devredop, rccl_bfloat16, ncclBfloat16)
-
-#define IMPL_COLL2A_CLIQUE(func, devredop) \
-  IMPL_COLL3_CLIQUE(func, devredop, int8_t,   ncclInt8) \
-  IMPL_COLL3_CLIQUE(func, devredop, uint8_t,  ncclUint8) \
-  IMPL_COLL3_CLIQUE(func, devredop, int32_t,  ncclInt32) \
-  IMPL_COLL3_CLIQUE(func, devredop, uint32_t, ncclUint32) \
-  IMPL_COLL3_CLIQUE(func, devredop, int64_t,  ncclInt64) \
-  IMPL_COLL3_CLIQUE(func, devredop, uint64_t, ncclUint64)
-
-#define IMPL_COLL_CLIQUE(func) \
-  IMPL_COLL2_CLIQUE(func, Sum) \
-  IMPL_COLL2_CLIQUE(func, Prod) \
-  IMPL_COLL2_CLIQUE(func, Min) \
-  IMPL_COLL2_CLIQUE(func, Max) \
-  IMPL_COLL2_CLIQUE(func, PreMulSum) \
-  IMPL_COLL2A_CLIQUE(func, SumPostDiv)
-// [/RCCL]
-
 // Copy primitives only define one function for copy
 #define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
 
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index 0349249e1a..d6fa08f186 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -17,9 +17,15 @@
 // Define min for ssize_t
 static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
 
-template <typename T>
-inline __device__ void loadPtr(void** ptr, T* &v) {
+inline __device__ int loadInt(int* ptr) { // returns the int read from *ptr
+  int v;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
   v = LOAD(ptr);
+#else
+  asm volatile("ld.volatile.global.u32 %0, [%1];" // non-HIP build: volatile 32-bit global load issued through inline PTX
+      : "=r"(v) : "l"(ptr));
+#endif
+  return v;
 }
 
 typedef uint64_t PackType;
@@ -485,16 +491,16 @@ struct MULTI128 {
 
 inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  v.x = p->x;
-  v.y = p->y;
+  v.x = __builtin_nontemporal_load(&p->x);
+  v.y = __builtin_nontemporal_load(&p->y);
 #else
   asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
 #endif
 }
 inline __device__ void Store128(Pack128* p, Pack128& v) {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  p->x = v.x;
-  p->y = v.y;
+  __builtin_nontemporal_store(v.x, &p->x);
+  __builtin_nontemporal_store(v.y, &p->y);
 #else
   asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
 #endif
diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu
index 54fd993ea3..af1d56fc26 100644
--- a/src/collectives/device/onerank_reduce.cu
+++ b/src/collectives/device/onerank_reduce.cu
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -17,11 +17,11 @@ namespace {
     int tid = threadIdx.x;
     int tn = blockDim.x;
     #pragma unroll 1
-    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e++) {
+    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) {
       ncclWorkElem *we = &w->elems[e];
-      intptr_t eltN = we->coll.count;
-      int bid = we->coll.bid;
-      int bn = we->coll.nChannels;
+      intptr_t eltN = we->count;
+      int bid = we->bid;
+      int bn = we->nChannels;
       T const *src = (T const*)we->sendbuff;
       T *dst = (T*)we->recvbuff;
 
@@ -37,13 +37,13 @@ namespace {
       src += i0;
       dst += i0;
       ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1>
-        (tid, tn, &(we->coll.redOpArg), true, 1, &src, 1, &dst, i1-i0);
+        (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0);
     }
   }
 }
 
 #define INSTANTIATE(devredop, type) \
-  __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)(struct ncclWorkElem* args) { \
+  __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
     oneRankReduce<type, Func##devredop<type>>(); \
   }
 
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 627407ddd0..ee8f06a569 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -125,7 +125,7 @@ struct FanSymmetric {
 };
 
 // The primitives class. Specialized per protocol in the other headers.
-template<typename T, typename RedOp, typename Fan, int Direct, typename Proto>
+template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p>
 class Primitives;
 
 // Used by LL & LL128 to implement direct members in the naive way.
@@ -157,20 +157,12 @@ struct PrimitivesWithoutDirect {
 #include "prims_ll128.h"
 
 #ifdef ENABLE_PROFILING
-#ifdef ENABLE_TIMING_PROFILE
 #define INIT_COUNTER \
-  if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); }
+  if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; t0 = __builtin_amdgcn_s_memrealtime(); ws = elem->elem[blockIdx.x].wait_cycle; }
 #define ACCUMULATE_COUNTER(prim) \
-  if (tid == 0 && args->coll.opCount) { devProf->elems[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0); \
-    devProf->elems[blockIdx.x].prim##_byte += nelem * sizeof(T); }
-#else
-#define INIT_COUNTER \
-  if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = devProf->elems[blockIdx.x].wait_cycle; }
-#define ACCUMULATE_COUNTER(prim) \
-  if (tid == 0 && args->coll.opCount) { devProf->elems[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
-    + ws - devProf->elems[blockIdx.x].wait_cycle); \
-    devProf->elems[blockIdx.x].prim##_byte += nelem * sizeof(T); }
-#endif
+  if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
+    + ws - elem->elem[blockIdx.x].wait_cycle); \
+    elem->elem[blockIdx.x].prim##_byte += nelem * sizeof(T); elem->elem[blockIdx.x].opCount = args->opCount;}
 #else
 #define INIT_COUNTER
 #define ACCUMULATE_COUNTER(prim)
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index d6e76d6985..7e0ca7211b 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -1,13 +1,13 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-template<typename T, typename RedOp, typename Fan, int Direct>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
-  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
+  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
 
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
@@ -45,7 +45,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
     __syncthreads();
 #else
-    asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group));
+    asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
 #endif
   }
 
@@ -123,7 +123,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
 
   template<int BeginIx>
   __device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
-    #pragma unroll 1
+    #pragma unroll
     for (int i=BeginIx; i < MaxRecv; i++) {
       if (i < fan.nrecv()) {
         union ncclLLFifoLine* src = recvPtr(i) + offset;
@@ -290,14 +290,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
 
     // Always waitSend in case of cleanup
     nelem = nelem < 0 ? 0 : nelem;
-#ifdef ENABLE_PROFILING
-    uint64_t t0;
-    if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime();
-#endif
     if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine));
-#ifdef ENABLE_PROFILING
-    if (SEND && tid == 0) ncclShmem->comm.devProf->elems[blockIdx.x].wait_cycle = (__builtin_amdgcn_s_memrealtime() - t0);
-#endif
 
     nelem -= tid*EltPerLine;
     srcElts += tid*EltPerLine;
@@ -324,7 +317,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
       }
       if (RECV) {
         data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
-        #pragma unroll 1
+        #pragma unroll MaxRecv
         for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
           peerData = readLLFinish(offset, line, i);
           data = MULTI<RedOp,T>()(redOp, peerData, data);
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 4dc0c21754..972ce9d091 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,9 +11,9 @@
 
 #define __any_sync(WARP_MASK, needReload) (true)
 
-template<typename T, typename RedOp, typename Fan, int Direct>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL128>:
-  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
+  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
 
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
@@ -52,7 +52,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128>:
   inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
 
   inline __device__ void barrier() {
-    asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group));
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    __syncthreads();
+#else
+   asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
+#endif
   }
 
   uint32_t abort = 0;
diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h
index 7160cc44ff..a107a1ce64 100644
--- a/src/collectives/device/prims_simple.h
+++ b/src/collectives/device/prims_simple.h
@@ -1,14 +1,14 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
 template<typename T, typename RedOp, typename Fan, int Direct,
-         int SlicePerChunk, int StepPerSlice, int Unroll>
+         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
 class Primitives<
-    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>
+    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p
   > {
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
@@ -19,7 +19,7 @@ class Primitives<
                        RolePostSend = 0x10,
                        RolePostRecv = 0x20,
                        Aborted = 0x40,
-                       PtrsFifoEnabled = 0x80,
+                       OffsFifoEnabled = 0x80,
                        SizesFifoEnabled = 0x100,
                        DirectWrite = 0x200,
                        DirectRead = 0x400,
@@ -33,10 +33,10 @@ class Primitives<
   int flags;
   int group;
   uint64_t step;
+  int *connOffsFifoPtr;   // (flags & OffsFifoEnabled)
   union {
-    void **connPtrsFifoPtr; // (flags & PtrsFifoEnabled)
     T *userBuff;            // (flags & (RoleInput|RoleOutput))
-    T *connEltsFifo;        // !(flags & (PtrsFifoEnabled|RoleInput|RoleOutput))
+    T *connEltsFifo;        // !(flags & (RoleInput|RoleOutput))
   };
   union {
     int volatile *connSizesFifoPtr; //  (flags & SizesFifoEnabled)
@@ -59,7 +59,7 @@ class Primitives<
     if (nthreads == WARP_SIZE)
       __syncwarp();
     else
-      asm volatile("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
+      asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads));
 #endif
     flags |= ThreadsSynced;
   }
@@ -70,7 +70,7 @@ class Primitives<
     if (nworkers == nthreads)
       barrier();
     else
-      asm volatile("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
+      asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers));
 #endif
   }
 
@@ -88,9 +88,6 @@ class Primitives<
     const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
     const bool noRecvWait = DirectRecv && Src && (flags & DirectRead);        // no wait when directly reading from remote input
     const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
-#if defined(ENABLE_PROFILING) && !defined(ENABLE_TIMING_PROFILE)
-    uint64_t t0 = __builtin_amdgcn_s_memrealtime();
-#endif
     if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
         ((flags & (Send*RoleWaitSend)) && !noSendWait)) {
       int spins = 0;
@@ -109,8 +106,8 @@ class Primitives<
 
       void **ptrs = isSendNotRecv ? (ncclShmem->groups[group].dsts + Dst)
                                   : (ncclShmem->groups[group].srcs + Src);
-      if (flags & PtrsFifoEnabled)
-        loadPtr(connPtrsFifoPtr + step%NCCL_STEPS, ptrs[index]);
+      if (flags & OffsFifoEnabled)
+        ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
       else if (isSendNotRecv && DirectSend) {
         if (flags & DirectWrite) {
           ptrs[index] = directBuff + remoteIx + offset;
@@ -132,14 +129,6 @@ class Primitives<
         ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
       }
       step += StepPerSlice;
-#if defined(ENABLE_PROFILING) && !defined(ENABLE_TIMING_PROFILE)
-      if (opCount) {
-        if (isSendNotRecv)
-          ncclShmem->comm.devProf->elems[blockIdx.x].wait_send_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
-        else
-          ncclShmem->comm.devProf->elems[blockIdx.x].wait_recv_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
-      }
-#endif
     }
   }
 
@@ -204,7 +193,10 @@ class Primitives<
         waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
         subBarrier();
 #ifdef ENABLE_PROFILING
-        if (tid == 0 && opCount) ncclShmem->comm.devProf->elems[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
+        if (tid == 0) {
+          struct ncclProfElem *elem = ncclShmem->comm.devProf.elems+opCount%PROFILE_NUM_ITEMS;
+          elem->elem[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
+        }
 #endif
         if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
           // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
@@ -262,6 +254,8 @@ class Primitives<
   }
 
   // Scatter/Gather generic op
+  // skip: my own rank order in the buffer chunks
+  // shift: peer offset to avoid all ranks sending to or receiving from the same peer
   template <int DirectRecv1, int DirectSend1, int Recv, int Send>
   __device__ __forceinline__ void
   ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) {
@@ -287,11 +281,13 @@ class Primitives<
           for (int j=0; j<fan.nsend(); j++) {
             int i = (j+shift)%fan.nsend();
             int peerOffset = i*peerElem;
+            // Skip the data I am responsible for reducing myself
             if (skip >= 0 && i >= skip) peerOffset += peerElem;
             const T* src0 = (T*)ncclShmem->groups[group].srcs[0] + peerOffset;
             int realPeerSize = min(realSize, totalElem-peerOffset);
             if (realPeerSize > 0 && ncclShmem->groups[group].dsts[i] != nullptr) {
               ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem->redOpArgs, false, 1, &src0, 1, (T**)ncclShmem->groups[group].dsts+i, realPeerSize);
+              // Mark for threadfence at the end
               if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] += realPeerSize;
             }
           }
@@ -319,6 +315,7 @@ class Primitives<
         }
       }
       barrier();
+      // If we actually sent something, issue a threadfence
       if (Send && (flags & RolePostSend) && ncclShmem->groups[group].totalSendSize[slice] > 0 && index == 0)
         __threadfence_system();
       __syncwarp();
@@ -340,18 +337,18 @@ class Primitives<
         ncclShmem->groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
         connStepPtr = conn->tail;
         connStepCache = LOAD(connStepPtr);
-        flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0;
+        flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
         if (Direct) {
           // User buffers have been registered
           if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
-            if (connIndex == 1) {
+            if (connIndex == 1 && P2p == 0) {
               flags |= DirectRead;  // scatter-reduce use direct pull
             } else {
               flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                        (e->direct & NCCL_DIRECT_READ)  ? DirectRead  : 0;
             }
           } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
-            if (connIndex == 1) {
+            if (connIndex == 1 && P2p == 0) {
               flags |= DirectRead;  // scatter-reduce use direct pull
             } else {
               // direct read not allowed in non-register case
@@ -360,10 +357,9 @@ class Primitives<
             }
           }
         }
-        if (flags & PtrsFifoEnabled)
-          connPtrsFifoPtr = conn->ptrsFifo;
-        else
-          connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+        if (flags & OffsFifoEnabled)
+          connOffsFifoPtr = conn->offsFifo;
+        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
       }
     }
   }
@@ -380,11 +376,10 @@ class Primitives<
         ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
         connStepPtr = conn->head;
         connStepCache = LOAD(connStepPtr);
-        flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0;
-        if (flags & PtrsFifoEnabled)
-          connPtrsFifoPtr = conn->ptrsFifo;
-        else
-          connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+        flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
+        if (flags & OffsFifoEnabled)
+          connOffsFifoPtr = conn->offsFifo;
+        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
 
         if (conn->sizesFifo != nullptr) {
           flags |= SizesFifoEnabled;
@@ -392,14 +387,14 @@ class Primitives<
         } else if (Direct) {
           // User buffers have been registered
           if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
-            if (connIndex == 1) {
+            if (connIndex == 1 && P2p == 0) {
               flags |= DirectRead;  // scatter-reduce use direct pull
             } else {
               flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
                        (e->direct & NCCL_DIRECT_READ)  ? DirectRead  : 0;
             }
           } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
-            if (connIndex == 1) {
+            if (connIndex == 1 && P2p == 0) {
               flags |= DirectRead;  // scatter-reduce use direct pull
             } else {
               // direct read not allowed in non-register case
@@ -419,7 +414,7 @@ class Primitives<
     ):
     tid(tid),
     stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)),
-    opCount(ncclShmem->work.elems[0].coll.opCount) {
+    opCount(ncclShmem->work.elems[0].opCount) {
 
     // For send operations, we need an extra warp to overlap the threadfence and the copy
     this->nthreads = nthreads;
@@ -460,7 +455,7 @@ class Primitives<
     loadRecvConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
     loadSendConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
 
-    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkRegElem*)e);
+    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
   }
 
   __device__ ~Primitives() {
@@ -477,7 +472,7 @@ class Primitives<
     barrier();
   }
 
-  __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkRegElem* e) {
+  __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
     if (flags & RoleInput) {
       userBuff = (T*)inputBuf;
       ncclShmem->redOpArgs[0] = redOpArg;  // scaler for local input
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 0bf2a98f2a..54355926a5 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -13,21 +13,21 @@ namespace {
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     ncclRing *ring = &ncclShmem->channel.ring;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
     const int nranks = ncclShmem->comm.nRanks;
     const ssize_t loopSize = nChannels*chunkSize;
-    const ssize_t size = args->coll.count;
+    const ssize_t size = args->count;
     const int rank = ncclShmem->comm.rank;
     const int prevRank = ring->devUserRanks[nranks-1];
-    const int root = args->coll.root;
+    const int root = args->root;
 
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
-      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg, args->coll.connIndex << 16);
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
 
     auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
       int realChunkSize;
@@ -36,7 +36,7 @@ namespace {
         realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
       }
       else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
       else if (Proto::Id == NCCL_PROTO_LL128)
         realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
       return realChunkSize;
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index a5ee6aefa5..9639372b79 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -13,9 +13,9 @@ namespace {
   template<typename T, typename RedOp, typename Proto>
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
     const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int bid = args->coll.bid;
-    const int nChannels = args->coll.nChannels;
+    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
     ncclRing *ring = &ncclShmem->channel.ring;
     int const *ringRanks = ring->devUserRanks;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
@@ -23,10 +23,10 @@ namespace {
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
     const int nranks = ncclShmem->comm.nRanks;
     const ssize_t loopSize = nChannels*chunkSize;
-    const ssize_t size = args->coll.count;
+    const ssize_t size = args->count;
 
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
-      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg, args->coll.connIndex << 16);
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
 
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       ssize_t realChunkSize;
@@ -35,7 +35,7 @@ namespace {
         realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
       }
       else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
       else if (Proto::Id == NCCL_PROTO_LL128)
         realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
       realChunkSize = int(realChunkSize);
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
index 698bccde53..15be552009 100644
--- a/src/collectives/device/sendrecv.h
+++ b/src/collectives/device/sendrecv.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,73 +11,67 @@
 
 template<typename T, typename RedOp>
 struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWork *work) {
-    int tid = threadIdx.x;
-    int group = 0;
-    const int rank = ncclShmem->comm.rank;
-    const int nRanks = ncclShmem->comm.nRanks;
-    using Proto = ProtoSimple<1, 1>;
-
-    for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
-      ncclWorkElem *args = &work->elems[s];
-      int nThreadsSegment = args->p2p.nThreads;
-      if (args->active == 0 || nThreadsSegment == 0) break;
-
-      int nThreadsSplit = nThreadsSegment/2;
-      int groupRecv = group;
-      group += Proto::calcGroupWidth(/*send=*/false, nThreadsSplit);
-      int groupSend = group;
-      group += Proto::calcGroupWidth(/*send=*/true, nThreadsSegment - nThreadsSplit);
-
-      if (tid < nThreadsSegment) {
-        // Compute pointers
-        T const* sendbuff = (const T*)args->sendbuff;
-        T* recvbuff = (T*)args->recvbuff;
-        ssize_t const sendCount = args->p2p.sendCount;
-        ssize_t const recvCount = args->p2p.recvCount;
-        int const delta = args->p2p.delta;
-
-        if (delta == 0) {
-          if (sendbuff != recvbuff) {
-            ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nThreadsSegment, nullptr, false, 1, &sendbuff, 1, &recvbuff, sendCount);
-          }
-        }
-        else {
-          if ((tid < nThreadsSplit) && recvCount >= 0) {
-            int const peer = (rank - delta + nRanks)%nRanks;
-            int const t0 = 0;
-            int const nt = nThreadsSplit;
-            int const chunkSize = args->p2p.recvChunkSize/sizeof(T);
-            Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto> prims
-              (tid-t0, nt, &peer, nullptr, nullptr, recvbuff, /*redOpArg(ignored)=*/0, groupRecv | (args->p2p.recvIdx << 16));
-            ssize_t offset = 0;
-            do {
-              int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T)));
-              nelem = min(chunkSize, recvCount-offset);
-              prims.directRecv(offset, nelem);
-              offset += nelem;
-            } while(offset < recvCount);
-          }
-
-          if ((tid >= nThreadsSplit) && sendCount >= 0) {
-            int const peer = (rank + delta)%nRanks;
-            int const t0 = nThreadsSplit;
-            int const nt = nThreadsSegment - nThreadsSplit;
-            int const chunkSize = args->p2p.sendChunkSize/sizeof(T);
-            Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto> prims
-              (tid-t0, nt, nullptr, &peer, sendbuff, nullptr, /*redOpArg(ignored)=*/0, groupSend | (args->p2p.sendIdx << 16));
-            ssize_t offset = 0;
-            do {
-              int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T)));
-              nelem = min(chunkSize, sendCount-offset);
-              prims.directSend(offset, offset, nelem);
-              offset += nelem;
-            } while(offset < sendCount);
-          }
-        }
-        break;
+  __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+    if (args->peer == ncclShmem->comm.rank) {
+      struct ncclWorkElemP2p* recvArgs = args-1;
+      if (args->buff != recvArgs->buff) {
+        ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
       }
-      tid -= nThreadsSegment;
+    } else {
+      using Proto = ProtoSimple<1, 1>;
+      ssize_t const count = args->count;
+      int const chunkSize = args->chunkSize/sizeof(T);
+      int const peer = args->peer;
+      Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
+        (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
+      ssize_t offset = 0;
+      do {
+        int nelem = min(chunkSize, count-offset);
+        prims.directSend(offset, offset, nelem);
+        offset += nelem;
+      } while(offset < count);
+    }
+  }
+
+  __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+    if (args->peer != ncclShmem->comm.rank) {
+      using Proto = ProtoSimple<1, 1>;
+      ssize_t const count = args->count;
+      int const chunkSize = args->chunkSize/sizeof(T);
+      int const peer = args->peer;
+      Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
+        (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
+      ssize_t offset = 0;
+      do {
+        int nelem = min(chunkSize, count-offset);
+        prims.directRecv(offset, nelem);
+        offset += nelem;
+      } while(offset < count);
+    }
+  }
+
+  __device__ __forceinline__ void run(ncclWork *work) {
+    struct ncclWorkElemP2p* args = work->p2pElems;
+    int ngroups = args->ngroups;
+    int tid = threadIdx.x;
+    int wid = tid / WARP_SIZE;
+    // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3
+    // warps for send, 2 warps for recv).
+    // warpStart values were rounded by integer division, but for the group number we need to round the other way,
+    // so we mirror wid, compute the group, then mirror the group again.
+    #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
+    int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
+    args += group;
+    if (args->header.type == ncclWorkTypeUnused) return;
+
+    tid -= args->warpStart * WARP_SIZE;
+    int nthreads = args->nWarps * WARP_SIZE;
+    group |= (args->connIndex<<16); // Used to select connIndex 1
+    if (tid >= nthreads || args->peer == -1) return;
+    if ((group%2) == 0) {
+      runRecv(tid, nthreads, group, args);
+    } else {
+      runSend(tid, nthreads, group, args);
     }
   }
 };
diff --git a/src/collectives/sendrecv_api.cc b/src/collectives/sendrecv_api.cc
index 296144dff1..b137d22526 100644
--- a/src/collectives/sendrecv_api.cc
+++ b/src/collectives/sendrecv_api.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -14,8 +14,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
 ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, hipStream_t stream) {
   NVTX3_FUNC_RANGE_IN(nccl_domain);
-  struct ncclInfo info = { ncclFuncSendRecv, "Send",
-    sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
+  struct ncclInfo info = { ncclFuncSend, "Send",
+    NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
     1, 1 };
   ncclResult_t ret;
   NCCLCHECK(ncclGroupStart());
@@ -29,7 +29,7 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
 ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, hipStream_t stream) {
   NVTX3_FUNC_RANGE_IN(nccl_domain);
-  struct ncclInfo info = { ncclFuncSendRecv, "Recv",
+  struct ncclInfo info = { ncclFuncRecv, "Recv",
     NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
     1, 1 };
   ncclResult_t ret;
diff --git a/src/debug.cc b/src/debug.cc
index 321d8081e3..fbf51524e0 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -168,3 +168,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   }
   pthread_mutex_unlock(&ncclDebugLock);
 }
+
+NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
+
+void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
+  // Names `thread` from the printf-style `fmt` and arguments; returns without naming unless ncclParamSetThreadName() is 1.
+  // pthread_setname_np is a nonstandard GNU extension, so the body is compiled only when _GNU_SOURCE is defined.
+#ifdef _GNU_SOURCE
+  if (ncclParamSetThreadName() != 1) return;
+  char threadName[NCCL_THREAD_NAMELEN];
+  va_list vargs;
+  va_start(vargs, fmt);
+  vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs); // output is truncated to fit NCCL_THREAD_NAMELEN bytes
+  va_end(vargs);
+  pthread_setname_np(thread, threadName); // best effort: the return value is not checked
+#endif
+}
diff --git a/src/enhcompat.cc b/src/enhcompat.cc
new file mode 100644
index 0000000000..97f5a3fb26
--- /dev/null
+++ b/src/enhcompat.cc
@@ -0,0 +1,28 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+/* Weak, hidden stub definitions that let libnccl_static.a work with older libcudart_static.a; each stub returns cudaErrorStubLibrary. */
+
+enum cudaError_t { cudaErrorStubLibrary = 34 }; // local declaration; only the value returned by the stubs below is listed
+
+extern "C" {
+
+cudaError_t cudaStreamGetCaptureInfo_v2(...)         __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaStreamGetCaptureInfo_v2(...)         { return cudaErrorStubLibrary; }
+
+cudaError_t cudaUserObjectCreate(...)                __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaUserObjectCreate(...)                { return cudaErrorStubLibrary; }
+
+cudaError_t cudaGraphRetainUserObject(...)           __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaGraphRetainUserObject(...)           { return cudaErrorStubLibrary; }
+
+cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaGetDriverEntryPoint(...)             __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaGetDriverEntryPoint(...)             { return cudaErrorStubLibrary; }
+
+}
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 100e05a52e..3b4ccb6ecf 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -68,7 +68,7 @@
   NCCL_FUNCS3B(func, Sum), /*PreMulSum*/ \
   NCCL_FUNCS3B(func, Sum)  /*SumPostDiv*/
 
-typedef void(*ncclKern_t)(struct ncclWorkElem first);
+typedef void(*ncclKern_t)(struct ncclDevComm* comm, struct ncclWorkElem first);
 // Must be consistent with the ncclFuncSet enum
 static ncclKern_t const ncclKerns[1] = {
   NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
@@ -89,6 +89,19 @@ error:
   return (res != ncclSuccess) ? 0 : max;
 }
 
+// Apply carveOut as the preferred shared-memory carveout attribute to every kernel in ncclKerns; returns res, which starts as ncclSuccess.
+ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) {
+  ncclResult_t res = ncclSuccess;
+  int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
+  for (int i = 0; i < numNcclKerns; i++) {
+    CUDACHECKGOTO(hipFuncSetAttribute((const void *)ncclKerns[i], hipFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error); // presumably sets res and jumps to error on failure; confirm against the macro
+  }
+
+error: // also reached by fall-through when the loop completes
+  return res;
+}
+
+
 /*****************************************************************************/
 /*       Launch system : synchronization and CUDA kernel launch              */
 /*****************************************************************************/
@@ -118,21 +131,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
   }
   int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
   struct ncclWork* w = channel->workFifo+opIndex;
-  struct ncclWorkElem* e = w->elems;
-  volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
-  while (activePtr[0] != 0) sched_yield();
+  volatile uint8_t* typePtr = (volatile uint8_t*)&w->header.type;
+  while (typePtr[0] != ncclWorkTypeUnused) sched_yield();
   memset(w, 0, sizeof(struct ncclWork));
   // Initialize with work elem if provided
-  if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
-  e->active = 1;
+  if (base) memcpy(w->elems, base, sizeof(struct ncclWorkElem));
   channel->workFifoTail++;
   channel->workCount++;
   if (work) *work = w;
   return ncclSuccess;
 }
 
+// Finalize channel work FIFO states before launch
+// Called during dynamic enqueue
 static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) {
   ncclComm_t comm = eqInfo->comm;
+  // Do not use comm->myParams in this function unless in non-graph mode.
+  // In graph mode, enqueue is asynchronous to capture, so myParams may have been changed.
   hipLaunchParams* params = comm->myParams;
 
   // Only launch blocks where we have work to do.
@@ -147,26 +162,24 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
     eqInfo->maxChannels = params->gridDim.x;
   }
 
-  // Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
+  // Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case).
   for (int c=0; c<eqInfo->maxChannels; c++) {
     struct ncclChannel* channel = comm->channels+c;
     if (channel->workCount == 0) {
       struct ncclWork* w;
       NCCLCHECK(getNextOp(channel, &w, NULL));
-      struct ncclWorkElem* e = w->elems;
-      e->comm = comm->devComm;
-      e->funcIndex = FUNC_INDEX_P2P;
-      e->p2p.nThreads = 0;
+      w->header.funcIndex = FUNC_INDEX_P2P;
+      w->header.type = ncclWorkTypeP2p;
+      w->header.nWarps = 0;
     }
-    channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active = 2;
+    channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1;
 
     if (c == 0) {
       // As we inline the first coll directly, we can free it immediately.
       // Except P2P or aggregation or registration cases
       struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS);
-      struct ncclWorkElem* elem = work->elems;
-      if (elem->funcIndex != FUNC_INDEX_P2P && eqInfo->elemList->count() == 1 && elem->regUsed == 0)
-        elem->active = 0;
+      if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1)
+        work->header.type = ncclWorkTypeUnused;
     }
 
     if (channel->gdrMemDesc) {
@@ -226,6 +239,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
+// Check dependency wrt outside streams or previous launches
+// Launch kernel in GROUP mode
 ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
   hipLaunchParams* params = comm->myParams;
   if (params->gridDim.x == 0) return ncclSuccess;
@@ -261,6 +276,7 @@ ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
+// Launch kernel in PARALLEL mode
 ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
   hipLaunchParams *params = comm->myParams;
   if (params->gridDim.x == 0) return ncclSuccess;
@@ -283,6 +299,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
   return ncclSuccess;
 }
 
+// Launch network proxy
 static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
   // Start the network proxies as soon as the kernel has been launched. We can't
   // perform any CUDA call between the two or having a cudaFree between the CUDA
@@ -302,6 +319,7 @@ static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
   return ncclSuccess;
 }
 
+// Record done event for current launch
 ncclResult_t ncclRecordEvents(ncclComm_t comm) {
   hipLaunchParams *params = comm->myParams;
 
@@ -320,6 +338,7 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) {
   return ncclSuccess;
 }
 
+// Reset parameter space for launch
 ncclResult_t ncclLaunchReset(ncclComm_t comm) {
   comm->userStreamSet = false;
 
@@ -333,6 +352,8 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) {
     NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo));
   }
 
+  // After capturing an op in graph mode or launching the op in non-graph mode
+  // we can reset myParams for use in next op
   hipLaunchParams *params = comm->myParams;
   params->gridDim.x = params->blockDim.x = 0;
   params->func = NULL;
@@ -347,10 +368,10 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) {
 /*****************************************************************************/
 /* Enqueueing system : computation of kernel and proxy operations parameters */
 /*****************************************************************************/
-RCCL_PARAM(SharpThreshold, "SHARP_THRESHOLD", 16384);
 
 static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
-  if (info->comm->collNetSupport > 0 && info->nBytes < rcclParamSharpThreshold()) {
+  if (info->comm->collNetSupport > 0) {
+    // Translate ncclAvg and PreMulSum
     ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
     NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport));
   } else {
@@ -359,6 +380,7 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet
   return ncclSuccess;
 }
 
+// numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
 static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) {
   struct ncclComm* comm = info->comm;
   if (comm->nRanks == 1 || info->coll == ncclFuncAllToAllPivot) {
@@ -395,6 +417,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
   int nt = comm->maxThreads[info->algorithm][info->protocol];
   int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
   if (info->algorithm == NCCL_ALGO_COLLNET) {
+    // CollNet channel tuning
     int ncSwitch = 16;
     bool flag = true;
     while (ncSwitch >= 1 && flag) {
@@ -405,6 +428,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
       ncSwitch /= 2;
     }
   } else {
+    // Ring/Tree channel tuning
     while (info->nBytes < nc*nt*threadThreshold) {
       if (nc >= 2) nc--;
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
@@ -419,6 +443,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
 #else
   if (info->protocol == NCCL_PROTO_SIMPLE) {
     nt += WARP_SIZE; // Extra warp for sync
+    // More threads or sync warps needed due to split thread model
     if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
     if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE;
   }
@@ -495,11 +520,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
 
 RCCL_PARAM(IntraNetThreshold, "INTRANET_THRESHOLD", 8388608);
 
-static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
-  work->comm = info->comm->devComm;
-
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) {
   int collNetTypeSupport = 0;
-  // Check whether algo and proto have been preset
+  // Check whether algo and proto have been preset (as in aggregation case)
+  // If so, skip the calculation
   if (info->nChannels > 0 && info->nThreads > 0) goto comp_next;
   NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport));
   NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1));
@@ -509,59 +533,37 @@ comp_next:
   NCCLCHECK(getPatternInfo(info));
   NCCLCHECK(getLoopInfo(info));
 
-  work->coll.opCount = info->comm->collOpCount;
+  work->opCount = info->opCount;
+  work->header.type = ncclWorkTypeColl;
   work->sendbuff = info->sendbuff;
   work->recvbuff = info->recvbuff;
-  work->coll.root = info->root;
-  work->coll.count = info->count;
-  work->coll.nChannels = info->nChannels;
-  work->nThreads = info->nThreads;
-  work->coll.redOpArg = info->opFull.scalarArg;
+  work->root = info->root;
+  work->count = info->count;
+  work->nChannels = info->nChannels;
+  work->header.nWarps = info->nThreads / info->comm->WarpSize;
+  work->redOpArg = info->opFull.scalarArg;
   work->redOpArgIsPtr = info->opFull.scalarArgIsPtr;
 
   if (info->comm->nRanks == 1) {
     // one-rank reduce index
-    work->funcIndex = FUNC_INDEX_P2P - ncclNumTypes + int(info->datatype);
+    work->header.funcIndex = FUNC_INDEX_P2P - ncclNumTypes + int(info->datatype);
+    //work->header.funcIndex = 1 + int(info->datatype);
     return ncclSuccess;
   } else if (info->coll == ncclFuncAllToAllPivot) {
-    work->funcIndex = FUNC_INDEX_ALLTOALL_PIVOT;
+    work->header.funcIndex = FUNC_INDEX_ALLTOALL_PIVOT;
   } else {
-    work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
+    work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
   }
 
-  work->coll.connIndex = 0;
-  proxyArgs->connIndex = 0;
+  work->connIndex = 0;
+  proxyOp->connIndex = 0;
   if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) {
     if (info->comm->useIntraNet && info->nBytes > rcclParamIntraNetThreshold()) {
-      work->coll.connIndex = NCCL_CONN_IDX_P2P_NET;
-      proxyArgs->connIndex = NCCL_CONN_IDX_P2P_NET;
+      work->connIndex = NCCL_CONN_IDX_P2P_NET;
+      proxyOp->connIndex = NCCL_CONN_IDX_P2P_NET;
     }
   }
 
-  { // [RCCL] Check for clique-based kernel support
-    if (info->comm->cliqueManager->IsSupported(info->coll,
-                                               info->count,
-                                               info->datatype,
-                                               info->op))
-    {
-      info->algorithm = NCCL_ALGO_RING;
-      info->protocol = NCCL_PROTO_CLIQUE;
-      // Determine the number of channels to use for clique-kernel
-      NCCLCHECK(info->comm->cliqueManager->GetNumChannelsToUse(info->coll,
-                                                               info->count,
-                                                               info->datatype,
-                                                               info->op,
-                                                               info->comm->nChannels,
-                                                               &work->clique.nChannels));
-      work->clique.count = info->count;
-      work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
-
-      // Setup pointers to where all the input/output pointers will be
-      NCCLCHECK(info->comm->cliqueManager->WaitForPointers(work));
-      return ncclSuccess;
-    }
-  } // [RCCL]
-
   int stepSize   = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
   int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
   int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
@@ -576,22 +578,22 @@ comp_next:
       while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
     }
     // Use lastChunkSize as chunkSize
-    work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+    work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
     // Optimize chunkSize / nSteps
     while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2;
     while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2;
     while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2;
     // Use lastChunkSize as chunkSize
-    work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+    work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
     // Set direct direction for broadcast-gather (read or write)
     work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
   } else if (info->protocol == NCCL_PROTO_LL) {
     const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
     const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
-    work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
-    ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
-    work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
+    work->lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+    ALIGN_SIZE(work->lastChunkSize, info->nThreads*sizeof(uint64_t));
+    work->lastChunkSize /= ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
     int nNodes = info->comm->nNodes;
     float ppn = info->comm->nRanks / (float)nNodes;
@@ -599,7 +601,7 @@ comp_next:
     while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
     while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
     // Use lastChunkSize as chunkSize
-    work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
+    work->lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
   }
 
   // Compute nSteps for proxies
@@ -608,29 +610,29 @@ comp_next:
   if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
   //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
   int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
-  proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
-  proxyArgs->sliceSteps = sliceSteps;
-  proxyArgs->chunkSteps = chunkSteps;
-  proxyArgs->chunkSize = chunkSize;
-  proxyArgs->protocol = info->protocol;
-  proxyArgs->dtype = info->datatype;
-  proxyArgs->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
+  proxyOp->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
+  proxyOp->sliceSteps = sliceSteps;
+  proxyOp->chunkSteps = chunkSteps;
+  proxyOp->chunkSize = chunkSize;
+  proxyOp->protocol = info->protocol;
+  proxyOp->dtype = info->datatype;
+  proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
                      info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
                      info->op;
-  proxyArgs->pattern = info->pattern;
-  proxyArgs->root = info->root;
+  proxyOp->pattern = info->pattern;
+  proxyOp->root = info->root;
   // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
   // because some protocols need to transmit more than the total size, plus they sometimes
   // round up
-  proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps;
+  proxyOp->nbytes = stepSize*proxyOp->sliceSteps;
 
   TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p",
-      proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
-      nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm);
+      proxyOp->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
+      nLoops, proxyOp->nsteps, chunkSize, info->comm);
 
   // For Pivot A2A, lastChunkSize is not needed, set pivotA2ANumBiRings instead
   if (info->coll == ncclFuncAllToAllPivot) {
-    work->coll.pivotA2ANumBiRings = info->comm->topo->pivotA2ANumBiRings;
+    work->pivotA2ANumBiRings = info->comm->topo->pivotA2ANumBiRings;
   }
 
   return ncclSuccess;
@@ -647,6 +649,7 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
   return ncclSuccess;
 }
 
+// Handle structure for user buffer registration (IPC) exchange
 struct ncclBuffRegHandle {
   hipIpcMemHandle_t sendBuffIpc;
   hipIpcMemHandle_t recvBuffIpc;
@@ -661,37 +664,48 @@ static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuf
   if (comm->localRanks == 1) return ncclSuccess;
   if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess;  // CUDA toolkit or driver version too old
 
-  struct ncclBuffRegHandle regHandles[NCCL_MAX_INTRA_RANKS];
+  ncclResult_t ret = ncclSuccess;
+  struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS];
   // Get IPC handles
   // Note: the handle only corresponds to the base address of the allocation
-  CUDACHECK(hipIpcGetMemHandle(&regHandles[comm->intraNodeRank].sendBuffIpc, (void*)info->sendbuff));
-  CUDACHECK(hipIpcGetMemHandle(&regHandles[comm->intraNodeRank].recvBuffIpc, (void*)info->recvbuff));
+  CUDACHECKGOTO(hipIpcGetMemHandle(&regHandles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback);
+  CUDACHECKGOTO(hipIpcGetMemHandle(&regHandles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback);
   // Get offset of user buffer within allocation
   void* baseAddr;
   size_t size;
+  // Get base address
   CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff));
-  regHandles[comm->intraNodeRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr;
+  regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr;
   CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff));
-  regHandles[comm->intraNodeRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr;
-  TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->intraNodeRank].recvBuffOffset);
+  regHandles[comm->localRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr;
+  TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset);
 
   // Exchange handles within node
-  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle)));
+  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle)));
   // Open handles at local process
   for (int i=0; i<comm->localRanks; i++) {
-    if (i == comm->intraNodeRank) {
+    // Skip myself
+    if (i == comm->localRank) {
       regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL;
       continue;
     }
+    // Get base address of mapping
     CUDACHECK(hipIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, hipIpcMemLazyEnablePeerAccess));
     CUDACHECK(hipIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, hipIpcMemLazyEnablePeerAccess));
-    // Get real address of buffer
+    // Get real buffer address by adding offset in the mapping
     regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset;
     regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset;
   }
+  // Marks the operation as being buffer registered
   regInfo->nBuffs = comm->localRanks;
   TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs);
   return ncclSuccess;
+
+reg_fallback:
+  // If we cannot register specific buffer types, we just bypass this stage, and continue without failing
+  (void)ret;
+  WARN("Unable to register user buffers");
+  return ncclSuccess;
 }
 
 // Compute enqueue element, save it in list
@@ -710,9 +724,8 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
   // Compute cuda kernel arg and proxy arg templates
   struct ncclQueueElem* eqElem;
   NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
-  struct ncclWorkElem* work = &eqElem->work;
-  eqElem->proxyArgs.nsubs = 1;
-  NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs));
+  struct ncclWork* work = &eqElem->work;
+  NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp));
 
   // Determine grid size
   hipLaunchParams* params = comm->myParams;
@@ -724,9 +737,13 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
   // Inline the first kernel
   if (params->func == NULL) {
     params->func = (void *)ncclKerns[0];
-    memcpy(&comm->args, work, sizeof(struct ncclWorkElem));
-    comm->args.coll.bid = 0;  // Only inline for channel 0
-    comm->args.active = 2;    // I am so far the last element; may be changed later in aggregation mode
+    if (work->header.type == ncclWorkTypeColl) {
+      // Copy the first operation to the inline argument. Type may be set later to
+      // ncclWorkTypeUnused if we have more than one coll element.
+      memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem));
+      comm->args.bid = 0;    // Only inline for channel 0
+      comm->args.header.isLast = 1; // I am so far the last element
+    }
   }
 
   // Register and exchange input and output buffers
@@ -736,15 +753,17 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
       comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
       comm->intraRanks == 1) {                  // only in multi-process mode
     NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo));
-    // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo
-    // because the registered addresses are in ncclWork
-    if (eqElem->buffRegInfo.nBuffs > 0) comm->args.active = 0;
     comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs;
+    work->header.type = ncclWorkTypeRegColl;
+    // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo
+    // because the registered addresses are in ncclWorkElemReg
+    comm->args.header.type = ncclWorkTypeUnused;
   }
 
   return ncclSuccess;
 }
 
+// Find the channel with the least enqueued work (counted in bytes)
 static inline int findShortestChannel(ncclComm_t comm) {
   size_t minSize = SIZE_MAX;
   int minC = 0;
@@ -758,6 +777,7 @@ static inline int findShortestChannel(ncclComm_t comm) {
   return minC;
 }
 
+// Get next channel based on shortest-queue mode or round-robin mode
 static inline int getNextChannel(ncclComm_t comm, int aggMode) {
   int nextChannel = 0;
   if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) {
@@ -769,6 +789,8 @@ static inline int getNextChannel(ncclComm_t comm, int aggMode) {
   return nextChannel;
 }
 
+// Setup aggregated kernels
+// Op info has been previously saved in comm->asyncOps
 ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
   if (comm->asyncOpCount == 0) {
     return ncclSuccess;
@@ -779,16 +801,22 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
     NCCLCHECK(ncclSetupCollKernel(info));
   } else {
     // Aggregation
+    // Determine a per-channel chunk size used to divide an operation into multiple channels
     size_t channelSize;
     if (comm->channelSize > 0) {
+      // Set by user
       channelSize = comm->channelSize;
     } else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) {
+      // CollNet specific size (tuned based on experiments)
       channelSize = 256 * 1024;
     } else {
-      channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks);  // scale channel size based on nranks as latency increases
+      // Latency increases as scale increases
+      // We would thus want to increase the chunk size to compensate for the lost efficiency
+      channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks);
     }
     // Reduce the per-channel size if we cannot fully utilize the channels
     while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
+    // Check whether the ops have the same reduction op and data type (and hence can be packed in the same ncclWork)
     int channelUsed = 0;
     int homogeneous = 1;
     int allCollNetSupport = comm->collNetSupport;
@@ -801,6 +829,8 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
         info->nChannels = std::min(std::max(1, (int)DIVUP(info->nBytes, channelSize)), comm->nChannels); // assign number of channels
       }
       channelUsed += info->nChannels;
+      //printf("asyncOpCount %d nChannels %d used %d info->nBytes %ld channelSize %ld comm->nChannels %d\n",
+        //c, info->nChannels, channelUsed, info->nBytes, channelSize, comm->nChannels);
       // We can use fast path if all collectives are the same
       homogeneous &= info->coll == comm->asyncOps[0].coll &&
                      info->opFull.op == comm->asyncOps[0].opFull.op &&
@@ -808,6 +838,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
       if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport));
     }
     // Compute algo, proto, nthreads for the entire kernel
+    // Prepare a synthetic op info to calculate the collective algo
     struct ncclInfo total;
     total.comm = comm;
     total.coll = comm->asyncOps[0].coll;
@@ -815,16 +846,18 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
     total.nChannels = std::min(channelUsed, comm->nChannels);
     int perChannelOps = DIVUP(channelUsed, total.nChannels);
     if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps));
+    // Set for each op
     for (int c = 0; c < comm->asyncOpCount; c++) {
       struct ncclInfo* info = comm->asyncOps+c;
       if (homogeneous) {
+        // Set fields to skip the individual computeColl in ncclSetupCollKernel
         info->algorithm = total.algorithm;
         info->protocol = total.protocol;
         info->nThreads = total.nThreads;
       }
       NCCLCHECK(ncclSetupCollKernel(info));
     }
-    comm->args.active = 0;  // disable inline argument
+    comm->args.header.type = ncclWorkTypeUnused;  // disable inline argument
   }
   // Reset counters
   comm->asyncOpCount = 0;
@@ -832,6 +865,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
   return ncclSuccess;
 }
 
+// Store aggregated operations info
 static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
   ncclComm_t comm = info->comm;
   if (comm->asyncOpCount >= NCCL_MAX_OPS) {
@@ -850,14 +884,19 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
   struct ncclComm* comm = info->comm;
   int peer = info->root;
   ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
-  if (info->opName[0] == 'S') { // Send
+  int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
+  int peerNode = comm->rankToNode[peer];
+  int peerIndex = comm->rankToLocalRank[peer];
+  int nsteps = comm->maxLocalRanks;
+  int rankIndex = comm->rankToLocalRank[comm->rank];
+  if (info->coll == ncclFuncSend) {
     if (peer != comm->rank) {
       int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].send[0].connected == 0) {
-          comm->connectSend[peer] |= (1<<channelId);
-          comm->connect[0] = 1;
+        if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
+          comm->connectSend[peer+comm->nRanks*1] |= (1<<channelId);
+          comm->connect[1] = 1;
         }
         if (comm->p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
           comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
@@ -865,16 +904,16 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
         }
       }
     }
-    NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], (void*)info->sendbuff, nBytes));
+    NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes, info->opCount));
     comm->p2pSendCount++;
   } else {
     if (peer != comm->rank) {
       int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].recv[0].connected == 0) {
-          comm->connectRecv[peer] |= (1<<channelId);
-          comm->connect[0] = 1;
+        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
+          comm->connectRecv[peer+comm->nRanks*1] |= (1<<channelId);
+          comm->connect[1] = 1;
         }
         if (comm->p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
           comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
@@ -882,138 +921,160 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
         }
       }
     }
-    NCCLCHECK(ncclSaveP2pInfo(comm->p2pRecvs[info->root], info->recvbuff, nBytes));
+    NCCLCHECK(ncclSaveP2pInfo(comm->p2pRecvs[info->root], info->recvbuff, nBytes, info->opCount));
     comm->p2pRecvCount++;
   }
   return ncclSuccess;
 }
 
-enum { RingTree_Segment=0, P2P_Segment=1, CollNet_Segment=2 };
-static int getSegment(int type, int delta, struct ncclWork* work) {
-  // Current ncclWork is full
-  if (work->elems[NCCL_MAX_WORK_ELEMENTS-1].active != 0) return -1;
+static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work, struct ncclComm* comm) {
+  if (work->header.type && (work->header.type != type)) return -1;
 
-  if (type == P2P_Segment) {  // P2P
-    // Do not mix P2P and collective ops
-    if (work->elems[0].funcIndex != FUNC_INDEX_P2P) return -1;
-    for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != delta; s++) {
-      if (work->elems[s].active == 0) return s;
+  if (type == ncclWorkTypeP2p) {  // P2P
+    int start = subType == ncclWorkSubTypeRecv ? 0 : 1;
+    for (int s=start; s<NCCL_MAX_WORK_ELEMENTS_P2P && s<NCCL_MAX_NTHREADS/comm->WarpSize; s+=2) {
+      if (work->p2pElems[s].peer == -1) return s;
+      // Do not aggregate multiple sends to the same peer (or receives from the same peer)
+      if (work->p2pElems[s].peer == peer) return -1;
     }
-  } else if (type == CollNet_Segment) { // CollNet
-    for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s+=NCCL_REG_ELEM_FACTOR) {
-      if (work->elems[s].active == 0) return s;
+  } else if (type == ncclWorkTypeRegColl) { // CollNet
+    for (int s=0; s<NCCL_MAX_WORK_ELEMENTS_REG; s++) {
+      if (work->regElems[s].elem.header.type == ncclWorkTypeUnused) return s;
     }
-  } else {  // Ring or Tree
+  } else if (type == ncclWorkTypeColl) {  // Ring or Tree
     for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
-      if (work->elems[s].active == 0) return s;
+      if (work->elems[s].header.type == ncclWorkTypeUnused) return s;
     }
   }
   return -1;
 }
 
-static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) {
-  elem->comm = info->comm->devComm;
-  elem->funcIndex = FUNC_INDEX_P2P;
-  elem->nThreads = NCCL_MAX_NTHREADS;
-  elem->sendbuff = info->sendbuff;
-  elem->recvbuff = info->recvbuff;
-  elem->p2p.opCount = info->comm->p2pOpCount;
-  elem->p2p.sendCount = info->sendbytes;
-  elem->p2p.recvCount = info->recvbytes;
-  elem->p2p.sendChunkSize = info->sendChunkSize;
-  elem->p2p.recvChunkSize = info->recvChunkSize;
-  elem->p2p.delta = info->delta;
+// Compute kernel arguments for P2P ops
+static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) {
+  elem->header.type = ncclWorkTypeP2p;
+  elem->header.funcIndex = FUNC_INDEX_P2P;
+  elem->header.nWarps = NCCL_MAX_NTHREADS/info->comm->WarpSize;
+  elem->buff = info->recvbuff;
+  elem->subType = info->coll == ncclFuncSend ? ncclWorkSubTypeSend : ncclWorkSubTypeRecv;
+  elem->count = info->count;
+  elem->chunkSize = info->chunkSize;
+  elem->peer = info->root;
+  elem->opCount = info->opCount;
+  elem->connIndex =  info->connIndex;
   return ncclSuccess;
 }
 
-static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s,
+// Enqueue work elements into segment of ncclWork
+// Supporting both collectives (aggregated or not) and P2P
+static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s,
     struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) {
-  // Copy element into corresponding segment of ncclWork
-  memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
-  work->elems[s].active = 1;
 
-  // Determine nThreads at dynamic time
-  if (type == P2P_Segment) {
-    const int nsegments = s+1;
-    int nThreads = 512;
-    while (nsegments*nThreads > 256) nThreads /= 2;
-    //if (nThreads >= 128) nThreads += WARP_SIZE;
-    for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
+  if (type == ncclWorkTypeP2p) {
+    memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p));
+    if(s) work->header.funcIndex = FUNC_INDEX_P2P;
+    int nelems = 0;
+    for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P && i<NCCL_MAX_NTHREADS/comm->WarpSize; i++) {
+      if (work->p2pElems[i].header.type) nelems = i+1;
+    }
+
+    int ngroups = 1;
+    while (ngroups < nelems) ngroups *= 2;
+    int nWarps = 1;
+    while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2;
+
+    for (int i=0; i<ngroups; i++) {
+      work->p2pElems[i].ngroups = ngroups;
+      work->p2pElems[i].warpStart =
+        i*(NCCL_MAX_NTHREADS/comm->WarpSize)/ngroups;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+      work->p2pElems[i].nWarps = nWarps;
+#else
+      int extraWarp = nWarps >= 2 ? i%2 : 0;
+      work->p2pElems[i].nWarps = nWarps + extraWarp;
+#endif
+    }
+    return ncclSuccess;
   }
 
+  memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
+
+  if (regInfo->nBuffs == 0) return ncclSuccess;
+
   // Copy registered buffer addresses into ncclWork
-  if (regInfo->nBuffs > 0) {
-    struct ncclWorkRegElem* regElem = (struct ncclWorkRegElem*)(work->elems+s);
-    // For CollNet
-    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
-      int peer = channel->collTree.down[i];
-      if (peer == -1) break;
-      int j = comm->rankToIntraNodeRank[peer];
-      if (j < 0) {
-        WARN("Invalid intra-node rank %d for peer %d", j, peer);
-        return ncclInternalError;
-      }
-      regElem->dnInputs[i] = regInfo->sendbuffs[j];
-      regElem->dnOutputs[i] = regInfo->recvbuffs[j];
+  struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s);
+  // For CollNet
+  for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
+    int peer = channel->collTree.down[i];
+    if (peer == -1) break;
+    // Get intra-node slot
+    int j = comm->rankToLocalRank[peer];
+    if (j < 0) {
+      WARN("Invalid intra-node rank %d for peer %d", j, peer);
+      return ncclInternalError;
     }
-    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
-      int peer = channel->collTree.up[i];
-      if (peer == -1) break;
-      int j = comm->rankToIntraNodeRank[peer];
-      if (j < 0) {
-        WARN("Invalid intra-node rank %d for peer %d", j, peer);
-        return ncclInternalError;
-      }
-      regElem->upOutputs[i] = regInfo->recvbuffs[j];
-    }
-    work->elems[s].regUsed = 1;
+    // Input buffer of leaf peer
+    regElem->dnInputs[i] = regInfo->sendbuffs[j];
+    // Output buffer of leaf peer
+    regElem->dnOutputs[i] = regInfo->recvbuffs[j];
   }
+  for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
+    int peer = channel->collTree.up[i];
+    if (peer == -1) break;
+    int j = comm->rankToLocalRank[peer];
+    if (j < 0) {
+      WARN("Invalid intra-node rank %d for peer %d", j, peer);
+      return ncclInternalError;
+    }
+    // Output buffer of root peer
+    regElem->upOutputs[i] = regInfo->recvbuffs[j];
+  }
+  work->elems[s].regUsed = 1;
   return ncclSuccess;
 }
 
+// Enqueue P2P op
 ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) {
-  struct ncclWorkElem* workElem = &eqElem->work;
-  struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
+  struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems;
+  struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
 
   // Try to reuse last p2p operation if not full yet
-  struct ncclChannel* channel = proxyArgs->subs[0].channel;
+  struct ncclChannel* channel = comm->channels+proxyOp->channelId;
   int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
   struct ncclWork* w = channel->workFifo+opIndex;
   int segment = -1;
   if (channel->workCount) {
     // Try to pack more segments into a single operation
-    segment = getSegment(P2P_Segment, workElem->p2p.delta, w);
+    segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w, comm);
   }
   if (segment == -1) {
     NCCLCHECK(getNextOp(channel, &w, NULL));
-    segment = 0;
+    segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1;
+    // Initialize work as P2P, set peer=-1 to designate the p2p elem is not used.
+    w->header.type = ncclWorkTypeP2p;
+    for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P && i<NCCL_MAX_NTHREADS/comm->WarpSize; i++) w->p2pElems[i].peer = -1;
   }
+  //printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? "Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment);
 
   // store work element into FIFO
-  NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs));
-  NCCLCHECK(enqueueSegOp(P2P_Segment, workElem, w, segment, &eqElem->buffRegInfo, channel, comm));
-  comm->p2pOpCount++;
+  NCCLCHECK(ncclProxySaveP2p(comm, proxyOp));
+  NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm));
   return ncclSuccess;
 }
 
+// Setup P2P op
 ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
   ncclComm* comm = info->comm;
   // Compute cuda kernel arg and proxy arg templates
   struct ncclQueueElem* eqElem;
   NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
   // The proxy code will set and tune the send/recv chunk size, make sure to run it first.
-  NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs));
-  NCCLCHECK(computeP2pWorkElem(info, &eqElem->work));
-
-  eqElem->proxyArgs.sendIdx = info->sendIdx;
-  eqElem->proxyArgs.recvIdx = info->recvIdx;
-  eqElem->work.p2p.sendIdx = info->sendIdx;
-  eqElem->work.p2p.recvIdx = info->recvIdx;
-
+  NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp));
+  NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems));
+  // Compute grid size
   int channelId = info->channelId;
   hipLaunchParams* params = comm->myParams;
   params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
-  params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.nThreads);
+  params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.header.nWarps*info->comm->WarpSize);
   comm->enqueueInfo->maxChannels = params->gridDim.x;  // params may be varied by a second graph hence we need to capture it here
 
   // Record the first kernel to launch
@@ -1021,8 +1082,8 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
   // The CUDA kernel does not use the inlined first work element as fastpath argument
   if (params->func == NULL) {
     params->func = (void *)ncclKerns[0];
-    comm->args.comm = eqElem->work.comm;
-    comm->args.active = 0;
+    //params->func = ncclKerns[eqElem->work.header.funcIndex];
+    comm->args.header.type = ncclWorkTypeUnused;
   }
   return ncclSuccess;
 }
@@ -1030,24 +1091,24 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
 // Dynamic enqueue function for collective kernels
 // Supports both aggregated and non-aggregated modes
 ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) {
-  struct ncclWorkElem* work = &eqElem->work;
-  struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
+  struct ncclWork* work = &eqElem->work;
+  struct ncclWorkElem* elem = work->elems;
+  struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
 
-  int nChannels = work->coll.nChannels;
-  size_t channelSize = work->coll.count*ncclTypeSize(proxyArgs->dtype)/work->coll.nChannels;
-  int segmentType = proxyArgs->redOp == ncclNumOps ? RingTree_Segment : CollNet_Segment;  // redOp is only set when using CollNet
+  int nChannels = elem->nChannels;
+  size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels;
+  enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? ncclWorkTypeColl : ncclWorkTypeRegColl;  // redOp is only set when using CollNet
 
   for (int bid=0; bid<nChannels; bid++) {
     int channelId = getNextChannel(comm, aggMode);
     struct ncclChannel* channel = comm->channels+channelId;
 
     // Proxy
-    proxyArgs->subs[0].channel = channel;
-    proxyArgs->opCount = comm->collOpCount;
-    proxyArgs->commOpCount = comm->opCount;
-    if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks));
+    proxyOp->channelId = channelId;
+    proxyOp->opCount = comm->collOpCount;
+    if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks));
 
-    work->coll.bid = bid % nChannels;
+    elem->bid = bid % nChannels;
     struct ncclWork* w = NULL;
     int segment = -1;
     if (aggMode && channel->workCount) {
@@ -1056,9 +1117,9 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem*
       w = channel->workFifo+opIndex;
       // All elems in work must have same (funcIndex,nThreads),
       // see "src/collectives/device/common.h"
-      if (w->elems[0].funcIndex == work->funcIndex &&
-          w->elems[0].nThreads == work->nThreads) {
-        segment = getSegment(segmentType, 0, w);
+      if (w->header.funcIndex == work->header.funcIndex &&
+          w->header.nWarps == work->header.nWarps) {
+        segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w, comm);
       }
     }
     if (segment == -1) {
@@ -1067,16 +1128,20 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem*
     }
 
     // store work element into FIFO
-    NCCLCHECK(enqueueSegOp(segmentType, work, w, segment, &eqElem->buffRegInfo, channel, comm));
+    NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm));
     channel->totalSize += channelSize;
   }
   comm->collOpCount++;
   return ncclSuccess;
 }
 
+// Host setup node for CUDA Graph
+// Performs the enqueue job
 template<int USING_CUDA_GRAPH>
 void HIPRT_CB ncclEnqueueHostSetup(void* arg) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
   ncclResult_t ret;
+  // All work for current launch has been captured in Queue Info
   struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg;
   ncclComm_t comm = eqInfo->comm;
   int aggMode = eqInfo->elemList->count() > 1 ? 1 : 0;
@@ -1084,7 +1149,7 @@ void HIPRT_CB ncclEnqueueHostSetup(void* arg) {
   // Iterate through the element list
   struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
   while (eqElem != NULL) {
-    if (eqElem->work.funcIndex == FUNC_INDEX_P2P) {
+    if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) {
       NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end);
     } else {
       NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end);
@@ -1105,6 +1170,8 @@ cb_end:
 template void HIPRT_CB ncclEnqueueHostSetup<0>(void*);
 template void HIPRT_CB ncclEnqueueHostSetup<1>(void*);
 
+// CUDA Graph helper thread
+// for de-registering user buffers
 void* graphHelperFunc(void *args) {
   struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args;
   if (res == NULL) {
@@ -1118,8 +1185,10 @@ void* graphHelperFunc(void *args) {
   volatile enum helperThreadState* state = &res->threadState;
   volatile int* ipcTail = &res->ipcTail;
   while (1) {
+    // Last IPC entry enqueued so far
     int ipcTailMark = *ipcTail;
     int ipcCount = 0;
+    // Close IPC till the last entry
     while (res->ipcHead != ipcTailMark) {
       if (res->ipcBases[res->ipcHead] != NULL)
         CUDACHECKIGNORE(hipIpcCloseMemHandle(res->ipcBases[res->ipcHead]));
@@ -1129,6 +1198,7 @@ void* graphHelperFunc(void *args) {
     }
     TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount);
     pthread_mutex_lock(&res->threadLock);
+    // Check for exit signal
     while (res->ipcHead == *ipcTail && *state != ThreadStop) {
       pthread_cond_wait(&res->threadCond, &res->threadLock);
     }
@@ -1140,23 +1210,24 @@ void* graphHelperFunc(void *args) {
   }
 }
 
+// Check if we are in CUDA Graph capture mode
 ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
   comm->usingCudaGraph = 0;
+  // Feature requires CUDA 11.3/R465 or above
 #if CUDART_VERSION >= 11030
-  hipStreamCaptureStatus captureStatus;
-  unsigned long long hipGraphId;
+  cudaStreamCaptureStatus captureStatus;
+  unsigned long long cudaGraphId;
+  ncclResult_t ret = ncclSuccess;
   if (comm->driverVersion < 11030) {
-    CUDACHECK(hipStreamIsCapturing(comm->userStream, &captureStatus));
-    if (captureStatus != hipStreamCaptureStatusNone) {
-      WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
-      return ncclInvalidUsage;
-    }
-    return ncclSuccess;
+    // Runtime driver version older than compiler version
+    // Enhanced compat fallback
+    goto enh_compat_end;
   }
-  CUDACHECK(hipStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &hipGraphId, graph, NULL, NULL));
-  if (captureStatus == hipStreamCaptureStatusActive) {
-    if (hipGraphId != comm->lastCudaGraphId) {
-      INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", hipGraphId);
+  // Get CUDA Graph handle
+  CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
+  if (captureStatus == cudaStreamCaptureStatusActive) {
+    if (cudaGraphId != comm->lastCudaGraphId) {
+      INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
       // We are in a new graph, hence need to forget the last setup node so that
       // the first setup node in the new graph will not have a dependency
       comm->lastCudaGraphId = hipGraphId;
@@ -1169,15 +1240,31 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
     // Only create this thread when buffer registration is enabled
     if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
       pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL);
+      // Init signaling method between Graph destroy function and helper thread
       pthread_cond_init(&comm->graphHelperResources->threadCond, NULL);
+      // Set state
       comm->graphHelperResources->threadState = ThreadStart;
+      // Create thread
       pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources);
+      // Name thread
+      ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", comm->cudaDev);
     }
   }
+  return ncclSuccess;
+
+enh_compat_end: // Enhanced compat fallback
+  (void)ret;
+  CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
+  if (captureStatus != cudaStreamCaptureStatusNone) {
+    WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+    return ncclInvalidUsage;
+  }
+  // If we are not in capture mode, we can ignore the driver being lower
 #endif
   return ncclSuccess;
 }
 
+// Create host setup node in CUDA Graph
 ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
 #if CUDART_VERSION >= 11030
   struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
@@ -1185,14 +1272,17 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
   // which CUDA graph would manage lifetime of
   hipUserObject_t object;
   CUDACHECK(hipUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, hipUserObjectNoDestructorSync));
+  // Hand over ownership to CUDA Graph
   CUDACHECK(hipGraphRetainUserObject(graph, object, 1, hipGraphUserObjectMove));
 
   hipHostFn_t fn = ncclEnqueueHostSetup<1>;
   // Add a CPU node to the graph
   hipGraphNode_t setupNode;
+  // Function + parameter space for that function (i.e. enqueue info)
   hipHostNodeParams setupNodeParams = {fn, eqInfo};
   int numDependencies = comm->lastSetupNode == NULL ? 0 : 1;
   CUDACHECK(hipGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams));
+  // Create dependency from last setup node in the same graph
   CUDACHECK(hipStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, hipStreamAddCaptureDependencies));
   comm->lastSetupNode = setupNode;
   return ncclSuccess;
@@ -1271,20 +1361,6 @@ static ncclResult_t hostToDevRedOp(
 }
 
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
-  // [RCCL] Check for clique-based kernel support
-  {
-    if (info->comm->cliqueManager->IsSupported(info->coll,
-                                               info->count,
-                                               info->datatype,
-                                               info->op))
-    {
-      // Declare the input / output pointers being used (to exchange via IPC with other ranks)
-      // This is done immediately, and does not block
-      NCCLCHECK(info->comm->cliqueManager->DeclarePointers(info->sendbuff, info->recvbuff));
-    }
-  }
-  // [/RCCL]
-
   ncclResult_t ret = ncclSuccess;
   bool isAsync = ncclAsyncMode();
   int savedDev = -1;
@@ -1300,6 +1376,12 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
   // op handle may be destroyed before ncclGroupEnd().
   NCCLCHECKGOTO(hostToDevRedOp(&info->opFull, info->op, info->datatype, info->comm), ret, end);
 
+  // Update opCount
+  if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv)
+    info->opCount = info->comm->p2pOpCount++;
+  else
+    info->opCount = info->comm->collOpCount;
+
   // Launch asynchronously if needed
   if (isAsync) {
     // Always register comm even in case of error to make sure ncclGroupEnd
@@ -1308,10 +1390,10 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
     NCCLCHECKGOTO(checkSetStream(info), ret, end);
 
     INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p devRedOp %d isPtr %d scaler %lx",
-        info->opName, info->coll == ncclFuncSendRecv ? info->comm->p2pOpCount : info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count,
+        info->opName, info->opCount, info->sendbuff, info->recvbuff, info->count,
         info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream, info->opFull.op, info->opFull.scalarArgIsPtr, info->opFull.scalarArg);
 
-    if (info->coll == ncclFuncSendRecv) { //p2p stored separately
+    if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { //p2p stored separately
       NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
     } else {
       NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
@@ -1320,7 +1402,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
     NCCLCHECKGOTO(checkSetStream(info), ret, end);
 
     INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
-        info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count,
+        info->opName, info->opCount, info->sendbuff, info->recvbuff, info->count,
         info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
 
     // Check whether we are in cuda graph mode
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index 3ac7d74164..0c3ba56629 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,6 +9,7 @@
 #include "graph.h"
 #include "trees.h"
 #include "rings.h"
+#include "topo.h"
 
 /******************************************************************/
 /********************* Internode connection ***********************/
@@ -18,7 +19,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
     struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
     struct ncclTopoRanks* topoRanks) {
   int rank = comm->rank;
-  int localRanks = comm->localRanks;
+  int localRanks = comm->topo->nodes[GPU].count;
   int nChannels = comm->nChannels;
 
   for (int c=0; c<nChannels; c++) {
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 712ac612bb..d2494755f3 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -170,20 +170,21 @@ static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* ret
   return ncclSuccess;
 }
 
-static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
-  struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c;
+static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
+  struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix;
   struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
 
   int l=0;
   // Node 1 -> CPU
-  for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i];
+  for (int i=0; i<srcNode->paths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
   // CPU -> Node 2
   for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
 
   // Update path characteristics
   srcNode->paths[t2][i2].count = l;
-  srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type);
-  srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
+  srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
+  if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
+  srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width);
   return ncclSuccess;
 }
 
@@ -240,6 +241,8 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
   return ncclSuccess;
 }
 
+NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
+
 int ncclTopoUserP2pLevel = -1;
 ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
   *p2p = 0;
@@ -255,13 +258,14 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
     return ncclSuccess;
   }
 
-
+  int intermediateIndex = -1;
   // Set intermediate GPU rank, if routing through an intermediate GPU.
   struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
   if (path->count == 2) {
     struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
-    if (intermediateNode->type == GPU && intermediateRank) {
-      *intermediateRank = intermediateNode->gpu.rank;
+    if (intermediateNode->type == GPU) {
+      intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
+      if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
     }
   }
 
@@ -291,6 +295,41 @@ compare:
   // Compute the PCI distance and compare with the p2pLevel.
   if (path->type <= p2pLevel) *p2p = 1;
 
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
+  if (*p2p == 1) {
+    // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
+    // validate against NVML at all since they are pretending to be on other hw.
+    if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) {
+      int indexes[3] = {-1,-1,-1};
+      int verticeN = 0;
+      NCCLCHECK(ncclNvmlEnsureInitialized());
+
+      indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev;
+      if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev;
+      indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev;
+
+      for (int i=1; i < verticeN; i++) {
+        nvmlGpuP2PStatus_t status;
+        status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead;
+        bool good = status == NVML_P2P_STATUS_OK;
+        status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
+        good &= status == NVML_P2P_STATUS_OK;
+        if (!good) {
+          if (ncclParamIgnoreDisabledP2p()) {
+            *p2p = 0;
+          } else if (path->type <= PATH_NVB) {
+            WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+            return ncclUnhandledCudaError;
+          } else if (path->type < PATH_SYS) {
+            INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+          }
+        }
+      }
+    }
+  }
+#endif
+
   if (path->type == PATH_NVL) {
     struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
     // Enable P2P Read for Ampere/NVLink only
@@ -359,6 +398,14 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
   }
 
   int distance = gpu->paths[NET][n].type;
+  if (distance == PATH_PXN) {
+    // In case of PXN, use the intermediate GPU distance instead
+    int proxyRank, g;
+    NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
+    NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
+    struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
+    distance = proxyGpu->paths[NET][n].type;
+  }
   if (distance > netGdrLevel) {
     INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
     return ncclSuccess;
@@ -369,6 +416,77 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
+  // Resolve GPU and NET indexes; *intermediateRank is `rank` itself unless the GPU->NET path is PATH_PXN.
+  int n, g;
+  NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
+  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+  struct ncclTopoLinkList* path = gpu->paths[NET]+n;
+  if (path->type == PATH_PXN) {
+    struct ncclTopoNode* node = NULL;  // remains NULL only for an empty path, which is rejected below
+    int type = NVS;
+    for (int i=0; i<path->count && type == NVS; i++) {  // skip NVS hops, stop at the first other node
+      node = path->list[i]->remNode;
+      type = node->type;
+    }
+    if (type != GPU) {
+      WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
+      return ncclInternalError;
+    }
+    *intermediateRank = node->gpu.rank;
+  } else {
+    *intermediateRank = rank;
+  }
+  return ncclSuccess;
+}
+
+NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
+
+// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
+// remote proxies without risking deadlocks
+int ncclPxnDisable() {
+  // Cached answer; -1 means the decision has not been made yet.
+  static int pxnDisable = -1;
+  if (pxnDisable != -1) return pxnDisable;
+  if (ncclNetVersion() == 4) {
+    INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
+    pxnDisable = 1;
+  } else {
+    pxnDisable = ncclParamPxnDisable();
+  }
+  return pxnDisable;
+}
+
+ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) {
+  // Collect the distinct proxy ranks ncclTopoGetNetDev reports for our peers (excluding our own rank).
+  *nranks = 0;
+  *intermediateRanks = NULL;
+  if (comm->topo->nodes[NET].count == 0) return ncclSuccess;
+
+  int count = 0;
+  int* proxies = NULL;  // grown one entry at a time; returned through *intermediateRanks
+  for (int peer=0; peer<comm->nRanks; peer++) {
+    int netDev, proxyRank;
+    NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, peer, &netDev, &proxyRank));
+    if (proxyRank == comm->rank) continue;  // this peer is reached without another rank as proxy
+    int useGdr;
+    NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
+    if (useGdr == 0) continue;
+    // Record each proxy rank only once.
+    int known = 0;
+    for (int r=0; r<count && !known; r++) {
+      known = (proxies[r] == proxyRank);
+    }
+    if (known) continue;
+    NCCLCHECK(ncclRealloc(&proxies, count, count+1));
+    proxies[count++] = proxyRank;
+  }
+  *nranks = count;
+  *intermediateRanks = proxies;
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
   // Precompute paths between GPUs/NICs.
 
@@ -393,7 +511,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
         // Divert all traffic through the CPU
         int cpu;
         NCCLCHECK(getLocalCpu(system, g, &cpu));
-        NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+        NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
       }
     }
 
@@ -420,6 +538,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
     NCCLCHECK(ncclTopoSetPaths(netNode, system));
 
     for (int g=0; g<system->nodes[GPU].count; g++) {
+      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
+      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+      if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
+        for (int p=0; p<system->nodes[GPU].count; p++) {
+          if (p == g) continue;
+          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
+
+          // To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one.
+          int netDev;
+          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
+          // Make sure we can allocate memory on that GPU.
+          if (netDev != netNode->id) continue;
+
+          // PXN = PCI + NVLink.
+          if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
+
+          // We can use that GPU as relay to communicate with that NIC.
+          // Only enabling it in the GPU->NIC direction for now to favor
+          // receiving locally and sending remotely (consistent with net.cc)
+          NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n));
+          break;
+        }
+      }
       // Update path when we dont want to / can't use GPU Direct RDMA.
       int gdr;
       NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
@@ -427,8 +568,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
         // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
         int localCpu;
         NCCLCHECK(getLocalCpu(system, g, &localCpu));
-        NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
-        NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
+        NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
+        NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
       }
     }
   }
@@ -499,9 +640,9 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
   if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
     && model == NCCL_TOPO_CPU_TYPE_ROME) {
     int gdr, ret = 1;
-    int64_t net;
+    int net;
     for (int g = 0; g < system->nodes[GPU].count; g++) {
-      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, &net, 0));
+      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, &net));
       NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
       if (!gdr) {
         ret = 0;
@@ -544,6 +685,8 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
   free(system);
 }
 
+NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 2);
+
 static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
   int peer;
   struct ncclTopoLinkList* path = NULL;
@@ -563,7 +706,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
     }
   } else {
     // Remote rank, use network
-    *nChannels = 1;
+    *nChannels = ncclParamNChannelsPerNetPeer();
   }
   return ncclSuccess;
 }
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 47c0de1009..4a5715d470 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -257,10 +257,10 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
 ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
 
 // Try to keep all searchs within one second
-#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19)
-#define NCCL_SEARCH_TIMEOUT (1<<18)
-#define NCCL_SEARCH_TIMEOUT_TREE (1<<17)
-#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10)
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18)
+#define NCCL_SEARCH_TIMEOUT (1<<14)
+#define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
+#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
 
 #define FORCED_ORDER_PCI 1
 #define FORCED_ORDER_REPLAY 2
@@ -341,6 +341,57 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
   return ncclSuccess;
 }
 
+// Build a list of the best NETs to try.
+//
+// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
+//  index when trying to get back to the NIC.
+//
+// The list is built the following way:
+// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
+// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
+//    based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
+//    might have been chosen by GPU 0 (case with multiple independent communicators per node)
+// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
+
+ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
+  int netCount = 0;
+  int localNetCount;
+  int* localNets;  // scratch: NET indexes whose path type from the current GPU equals t
+  NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
+
+  for (int t=0; t <= typeInter; t++) {  // lowest path type first, up to typeInter
+    for (int g=0; g<system->nodes[GPU].count; g++) {
+      if (gpu != -1 && gpu != g) continue;
+      localNetCount = 0;
+      struct ncclTopoNode* gpuNode = system->nodes[GPU].nodes+g;  // named so it does not shadow the `gpu` parameter
+      struct ncclTopoLinkList* paths = gpuNode->paths[NET];
+      for (int n=0; n<system->nodes[NET].count; n++) {
+        if (paths[n].type == t) localNets[localNetCount++] = n;
+      }
+      if (localNetCount == 0) continue;
+      // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
+      // with multiple NICs don't use the same one as first choice.
+      for (int r=0; r<gpuNode->gpu.dev % localNetCount; r++) {
+        int net0 = localNets[0];
+        for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
+        localNets[localNetCount-1] = net0;
+      }
+      // Append NICs to list
+      for (int i=0; i<localNetCount; i++) {
+        int n = localNets[i];
+        int found = 0;  // index of n in nets[0..netCount), or netCount when absent
+        while (found<netCount && nets[found] != n) found++;  // bound checked first: only the filled prefix is read
+        if (found == netCount) nets[netCount++] = n;
+      }
+    }
+  }
+
+  *netCountRet = netCount;
+  free(localNets);
+
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
   if ((*time) <= 0) return ncclSuccess;
   (*time)--;
@@ -369,7 +420,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
       int startNetIndex;
       NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
       struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
-      for (int n=0; n<system->nodes[NET].count; n++) {
+      int netcount;
+      int* nets;
+      NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
+      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
+      for (int i=0; i<netcount; i++) {
+        int n = nets[i];
         struct ncclTopoNode* net = system->nodes[NET].nodes+n;
         if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
         if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
@@ -395,6 +451,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
           graph->speedInter = speedInterSave;
         }
       }
+      free(nets);
     }
   } else if (step < system->nodes[GPU].count-1) {
     // Go to next GPU
@@ -429,65 +486,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
   return ncclSuccess;
 }
 
-// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance.
-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) {
-  float* maxwidths;
-  int* minhops;
-  int netcount = 0;
-  NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count));
-  NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count));
-  for (int n=0; n<system->nodes[NET].count; n++) {
-    maxwidths[n] = 0.0;
-    minhops[n] = 255;
-    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-    struct ncclTopoLinkList* paths = net->paths[GPU];
-    for (int g=0; g<system->nodes[GPU].count; g++) {
-      if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) {
-        maxwidths[n] = paths[g].width;
-        minhops[n] = paths[g].count;
-      }
-    }
-    if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW
-    if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW
-    int index;
-    for (index = 0; index < netcount; index++) {
-      if (minhops[n] < minhops[nets[index]]) break;
-    }
-    // Insert net at index
-    // Shift all nets with higher nhops
-    for (int i = netcount; i>index; i--) nets[i] = nets[i-1];
-    // Insert this net at index
-    nets[index] = n;
-    netcount++;
-  }
-
-  *netcountRet = netcount;
-
-  // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have
-  // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs.
-  for (int start = 0; start < netcount;) {
-    int end = start+1;
-    while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++;
-    // Shuffle
-    for (int r=0; r<system->nodes[GPU].nodes[0].gpu.dev % (end-start); r++) {
-      int netStart = nets[start];
-      for (int i=start; i<end-1; i++) nets[i] = nets[i+1];
-      nets[end-1] = netStart;
-    }
-    start = end;
-  }
-
-  free(minhops);
-  free(maxwidths);
-  return ncclSuccess;
-}
-
 ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
   const int speed = graph->speedInter;
   int* nets;
   NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
   int netcount;
-  NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount));
+  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
   for (int i=0; i<netcount; i++) {
     int n = nets[i];
     struct ncclTopoNode* net = system->nodes[NET].nodes+n;
@@ -497,6 +501,8 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
     if (net->net.maxChannels == 0) continue;
 
     graph->inter[graph->nChannels*2] = net->id;
+    graph->latencyInter = net->net.latency;
+
     for (int i=0; i<system->nodes[NET].count; i++) {
       if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
           (system->nodes[NET].nodes[i].net.port == net->net.port)) {
@@ -637,7 +643,18 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
 /* User defined graph from XML file */
 /************************************/
 
-struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
+struct kvDict kvDictLinkType[] = {
+  { "LOC", PATH_LOC },
+  { "NVL", PATH_NVL },
+  { "NVB", PATH_NVB },
+  { "PIX", PATH_PIX },
+  { "PXB", PATH_PXB },
+  { "PXN", PATH_PXN },
+  { "PHB", PATH_PHB },
+  { "SYS", PATH_SYS },
+  { NULL, 0 }
+};
+
 ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
   int ngpus = system->nodes[GPU].count;
   int* inter = graph->inter+2*c;
@@ -677,6 +694,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
   NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
   NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra));
   NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter));
+  if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0;
   const char* str;
   NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
   NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
@@ -735,6 +753,7 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop
   NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
   NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra));
   NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter));
+  NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter));
   const char* str;
   NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
   NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
@@ -768,12 +787,14 @@ float speedArrayInter[] = { 48.0, 30.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0,
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 
 RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
-RCCL_PARAM(EnableMultipleSAT, "ENABLE_MULTIPLE_SAT", 0);
+NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
 
 ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
   int ngpus = system->nodes[GPU].count;
+  graph->crossNic = ncclParamCrossNic();
   int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
   graph->speedIntra = graph->speedInter = 0;
+  graph->latencyInter = 0;
   if (graph->crossNic == 2) graph->crossNic = 0;
   graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
   graph->typeInter = PATH_PIX;
@@ -821,8 +842,6 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
     // limit single node max channels when searching ring graph on Rome
     graph->maxChannels = 2;
   }
-  if (graph->collNet && !rcclParamEnableMultipleSAT())
-    graph->maxChannels = 1;
   if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
 
   // SPLIT_TREE works better on older archs.
@@ -890,19 +909,13 @@ search:
       goto search;
     }
     tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
-    if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) {
+
+    if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
       tmpGraph.typeInter += 1;
       goto search;
     }
     tmpGraph.typeInter = PATH_PIX;
 
-    // Try a simpler tree
-    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
-      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
-      goto search;
-    }
-    tmpGraph.pattern = graph->pattern;
-
     if (crossNic && tmpGraph.crossNic == 0) {
       // Try again with crossNic if permitted
       tmpGraph.crossNic = crossNic;
@@ -910,6 +923,13 @@ search:
     }
     tmpGraph.crossNic = 0;
 
+    // Try a simpler tree
+    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
+      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+      goto search;
+    }
+    tmpGraph.pattern = graph->pattern;
+
     // Decrease speed until we find a solution
     if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
       tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
@@ -1014,17 +1034,66 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) {
+// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
+NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
+
+#include "comm.h"
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
   if (graph) {
     // Honor the net device in the graph
     int channel = channelId%graph->nChannels;
-    int ngpus = system->nodes[GPU].count;
+    int ngpus = comm->topo->nodes[GPU].count;
     int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
     *dev = graph->inter[channel*2+index];
+    NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
+  } else if (peerRank == -1) {
+    return ncclInternalError;
   } else {
-    int64_t id;
-    NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr));
-    *dev = id;
+    // Start with our local NIC and local Rank
+    NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
+    *proxyRank = rank;
+
+    int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel();
+    // See whether we can use the remote rank preferred device.
+    if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
+      int netDev = comm->peerInfo[peerRank].netDev;
+      int n;  // index of netDev among the NET nodes; single declaration shared by the branches below
+      // Check that device exists on our node
+      if (ncclParamCrossNic() == 0) {
+        if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) {
+          WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
+          return ncclInvalidUsage;
+        }
+        *dev = netDev;
+      }
+      if (pxnLevel == 1) {
+        int g;
+        NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g));
+        NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
+        struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g;
+        if (gpu->paths[NET][n].type <= PATH_PXN) {
+          *dev = netDev;
+          NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
+        }
+      } else if (pxnLevel == 2) {
+        // Check whether we can access it through our node-local GPU for that NIC.
+        for (int r=0; r<comm->localRanks; r++) {
+          int localPeer = comm->localRankToRank[r];  // node-local rank; named so it does not shadow the peerRank parameter
+          if (comm->peerInfo[localPeer].netDev == netDev) {
+            int g1, g2;
+            NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
+            NCCLCHECK(ncclTopoRankToIndex(comm->topo, localPeer, &g2));
+            NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
+            struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
+            if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
+              *proxyRank = localPeer;
+              *dev = netDev;
+              return ncclSuccess;
+            }
+          }
+        }
+      }
+    }
   }
   return ncclSuccess;
 }
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 226217c1b7..7f13451227 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -22,11 +22,11 @@
 
 const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-const char* topoLinkTypeStr[] = { "LOC", "XGMI", "",    "PCI", "",    "",    "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PHB", "SYS" };
+const char* topoLinkTypeStr[] = { "LOC", "XGMI", "",    "PCI",    "",    "",    "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
 #else
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "",    "PCI", "",    "",    "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "",    "PCI",    "",    "",    "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
 #endif
 
 /******************************************************************/
@@ -127,6 +127,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
     n->net.asic = 0ULL;
     n->net.port = NCCL_TOPO_UNDEF;
     n->net.width = 0.0;
+    n->net.latency = 0.0;
   }
   *node = n;
   return ncclSuccess;
@@ -338,13 +339,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
 
   ncclDebugNoWarn = NCCL_GRAPH;
   int mbps;
-  if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
+  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
   if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
   net->net.width = mbps / 8000.0;
-  if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
-  if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
-  if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
-  if (ncclCollNet && xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+  if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0;
+  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
+  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
+  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
+  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0));
   net->net.busId = busId;
   ncclDebugNoWarn = 0;
 
@@ -653,6 +655,16 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr
   }
   return ncclSuccess;
 }
+static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
+  // Add attrName with the given value only when the node does not already carry that attribute.
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index != -1) return ncclSuccess;
+  index = node->nAttrs++;
+  strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+  snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value);
+  return ncclSuccess;
+}
 
 
 ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
@@ -689,7 +701,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
   // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
   // so we start with collnet so that it has precedence.
   int netDevCount = 0;
-  if (ncclCollNet) {
+  if (collNetSupport()) {
     NCCLCHECK(collNetDevices(&netDevCount));
     for (int n=0; n<netDevCount; n++) {
       ncclNetProperties_t props;
@@ -718,6 +730,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
     NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
     NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+    NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
     NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
     NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
     NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
@@ -737,7 +750,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) {
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) {
   int g;
   NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
   int minType = PATH_SYS;
@@ -754,6 +767,13 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
     }
     if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
   }
+  if (count == 0) {
+    *id = -1;
+    free(nets);
+    return ncclSuccess;
+  }
+
+  int rr = system->nodes[GPU].nodes[g].gpu.dev;
   *id = nets[rr%count];
   free(nets);
   return ncclSuccess;
@@ -853,3 +873,14 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
   if (ccMax) *ccMax = max;
   return ncclSuccess;
 }
+
+ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
+  // Find the index of the GPU node whose gpu.rank equals `rank`; *localRank is left untouched on failure.
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (system->nodes[GPU].nodes[g].gpu.rank != rank) continue;
+    *localRank = g;
+    return ncclSuccess;
+  }
+  WARN("Could not find local GPU with rank %d", rank);
+  return ncclInternalError;
+}
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 321fb1e31d..56a468129d 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -45,9 +45,10 @@ extern const char* topoNodeTypeStr[];
 // Skipping 2 for PATH_NVB
 #define LINK_PCI 3
 // Skipping 4 for PATH_PXB
-// Skipping 5 for PATH_PHB
-#define LINK_SYS 6
-#define LINK_NET 7
+// Skipping 5 for PATH_PXN
+// Skipping 6 for PATH_PHB
+#define LINK_SYS 7
+#define LINK_NET 8
 extern const char* topoLinkTypeStr[];
 
 #define PATH_LOC 0
@@ -55,8 +56,10 @@ extern const char* topoLinkTypeStr[];
 #define PATH_NVB 2
 #define PATH_PIX 3
 #define PATH_PXB 4
-#define PATH_PHB 5
-#define PATH_SYS 6
+#define PATH_PXN 5
+#define PATH_PHB 6
+#define PATH_SYS 7
+#define PATH_DIS 7
 extern const char* topoPathTypeStr[];
 
 struct ncclTopoNode;
@@ -102,6 +105,7 @@ struct ncclTopoNode {
       uint64_t asic;
       int port;
       float width;
+      float latency;
       int gdrSupport;
       int collSupport;
       int maxChannels;
@@ -149,8 +153,7 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id)
 ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
 ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
-
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr);
+ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank);
 
 ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
 ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 63d0443a39..ff04c58444 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -110,7 +110,7 @@ static struct tuningModel tuning_model_1 {
 
   .bwRatio =
   { /* 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
     /* more than 2 nodes */
     { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
   },
@@ -213,8 +213,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
   comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
-  comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
-    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
+  comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
 #endif
@@ -246,7 +245,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
       coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
       nRanks;
-    int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
+    int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
       coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
       nNodes;
 
@@ -269,7 +268,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh*graphs[a]->nChannels);
         if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
         if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
-        if (a == NCCL_ALGO_COLLNET) busBw *= .9;
         if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels);
 #endif
         if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Oneshot CollNet only supports Simple
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 6b2c2e2e65..9b4a610aed 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -617,7 +617,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
       NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
       if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1;
     } else {
-      NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
+      NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
     }
     NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
   }
@@ -632,7 +632,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
       CUDACHECK(hipGetDeviceProperties(&devProp, dev));
       cudaMajor = devProp.major; cudaMinor = devProp.minor;
     } else {
-      NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
+      NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
     }
     NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor));
   }
@@ -703,15 +703,15 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
     for (int l=0; l<maxNvLinks; ++l) {
       // Check whether we can use this NVLink for P2P
       unsigned canP2P;
-      if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+      if ((ncclNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
 
       // Make sure the Nvlink is up. The previous call should have trained the link.
       nvmlEnableState_t isActive;
-      if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+      if ((ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
 
       // Try to figure out what's on the other side of the NVLink
       nvmlPciInfo_t remoteProc;
-      if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+      if (ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
 
       // Make a lower case copy of the bus ID for calling ncclDeviceType
       // PCI system path is in lower case
@@ -782,13 +782,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
   NCCLCHECK(ncclTopoGetXmlFromGpu(node, NULL, xml, gpuNode));
 #else
   nvmlDevice_t nvmlDev = NULL;
-  static int nvmlInit = 0;
-  if (nvmlInit == 0) {
-    nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
-  }
-  if (nvmlInit == 1) {
-    if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
-  }
+  if (ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
   NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
 #endif
   return ncclSuccess;
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 07d4c8d6ee..588cba20f1 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -95,6 +95,14 @@ static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName
   return ncclSuccess;
 }
 
+static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* attrName, int* value, int defaultValue) {
+  const char* attrText; // Filled by xmlGetAttr; a NULL result selects defaultValue below
+  NCCLCHECK(xmlGetAttr(node, attrName, &attrText));
+  *value = (attrText == NULL) ? defaultValue : strtol(attrText, NULL, 0); // Base 0: strtol auto-detects decimal, 0x hex and 0 octal
+  return ncclSuccess;
+}
+
+
 static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
   const char* str;
   NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
diff --git a/src/group.cc b/src/group.cc
index 6fc12729f9..84cfb66b8a 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -56,21 +56,6 @@ struct ncclAsyncArgs {
 
 thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
 
-#define NCCLCHECKTHREAD(a) do { \
-  if ((args->ret = (a)) != ncclSuccess) { \
-    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
-    return args; \
-  } \
-} while(0)
-
-#define CUDACHECKTHREAD(a) do { \
-  if ((a) != hipSuccess) { \
-    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
-    args->ret = ncclUnhandledCudaError; \
-    return args; \
-  } \
-} while(0)
-
 void* ncclAsyncThreadMain(void* args_) {
   struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
   NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
@@ -120,18 +105,23 @@ ncclResult_t ncclGroupStart() {
   return ncclSuccess;
 }
 
-static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes,
-  void* recvbuff, ssize_t sendbytes, const void* sendbuff, uint16_t sendIdx, uint16_t recvIdx) {
-  struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
-    sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
+static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) { // Describe one send of `count` ncclInt8 elements to `peer` and hand it to ncclSetupP2pKernel
+  struct ncclInfo info = { ncclFuncSend, "Send",
+    NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ // NOTE(review): buff sits in the slot the removed scheduleSendRecv used for recvbuff - confirm this is intended for sends
     1, 1 };
-  info.delta = delta;
   info.channelId = channelId;
-  info.sendbytes = sendbytes;
-  info.recvbytes = recvbytes;
-  info.sendIdx = sendIdx;
-  info.recvIdx = recvIdx;
-  if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
+  info.opCount = opCount;
+  info.connIndex = connIndex;
+  NCCLCHECK(ncclSetupP2pKernel(&info)); // Any failure is propagated to the caller by NCCLCHECK
+  return ncclSuccess;
+}
+static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) { // Describe one receive of `count` ncclInt8 elements from `peer` and hand it to ncclSetupP2pKernel
+  struct ncclInfo info = { ncclFuncRecv, "Recv",
+    NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
+    1, 1 };
+  info.channelId = channelId; // Fields that the initializer list above does not cover are set individually
+  info.opCount = opCount;
+  info.connIndex = connIndex;
   NCCLCHECK(ncclSetupP2pKernel(&info));
   return ncclSuccess;
 }
@@ -203,15 +193,15 @@ ncclResult_t ncclGroupEnd() {
 
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
-      args->coll.connIndex = 0;
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
+      args->coll.connIndex = 1;
       pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
     }
   }
 
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
       int err = pthread_join(ncclGroupThreads[i], NULL);
       if (err != 0) {
         WARN("Error waiting for pthread_join : %s", strerror(errno));
@@ -219,7 +209,7 @@ ncclResult_t ncclGroupEnd() {
       }
       INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
       NCCLCHECKGOTO(args->ret, ret, end);
-      args->coll.comm->connect[0] = 0;
+      args->coll.comm->connect[1] = 0;
     }
   }
 
@@ -271,18 +261,31 @@ ncclResult_t ncclGroupEnd() {
           int index = 0;
           int delta = deltas[index];
 sched_delta:
-          uint32_t from = (rank+nRanks-delta)%nRanks;
-          uint32_t to = (rank+delta)%nRanks;
-          struct ncclP2Pinfo* recv = comm->p2pRecvs[from] ? comm->p2pRecvs[from]->getNext() : NULL;
-          struct ncclP2Pinfo* send = comm->p2pSends[to] ? comm->p2pSends[to]->getNext() : NULL;
+          uint32_t recvPeer = (rank+nRanks-delta)%nRanks;
+          uint32_t sendPeer = (rank+delta)%nRanks;
+          struct ncclP2Pinfo* recv = comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL;
+          struct ncclP2Pinfo* send = comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL;
           if (recv != NULL || send != NULL) {
             ssize_t totRecvBytes = -1, totSendBytes = -1;
             if (recv != NULL) totRecvBytes = recv->nbytes;
             if (send != NULL) totSendBytes = send->nbytes;
+            if (recv) comm->p2pRecvCount--;
+            if (send) comm->p2pSendCount--;
+            if (recvPeer == comm->rank) { // Check self send/recv
+              if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; }
+              if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; }
+              if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; }
+            }
+            void* recvBuff = recv ? recv->buff : NULL;
+            void* sendBuff = send ? send->buff : NULL;
+            // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL.
+            if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle();
+            if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle();
+
             ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
             ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
 
-            uint16_t sendIdx = 0, recvIdx = 0;
+            uint16_t sendIdx = 1, recvIdx = 1;
             if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold())
               sendIdx = NCCL_CONN_IDX_P2P_NET;
             if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold())
@@ -299,23 +302,20 @@ sched_delta:
               if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
               if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
               // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
-              // (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
-              if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
-              if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
-              if (sendbytes >= 0 || recvbytes >= 0) {
-                NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
-                      recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
-                      sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL, sendIdx, recvIdx), ret, group_cleanup);
+              // (total size == 0), otherwise skip the operation by setting send/recv to NULL.
+                if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
+                if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
+              if (recv) {
+                NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup);
+              }
+              if (send) {
+                NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup);
               }
               recvOffset += recvChunkSize;
               sendOffset += sendChunkSize;
               chunk++;
             } while (sendRemaining || recvRemaining);
-            if (recv) comm->p2pRecvCount--;
-            if (send) comm->p2pSendCount--;
           }
-          if (recv == NULL && comm->p2pRecvs[from]) comm->p2pRecvs[from]->recycle();
-          if (send == NULL && comm->p2pSends[to]) comm->p2pSends[to]->recycle();
           index++;
           if (index == 1 && deltas[1] == deltas[0]) index++;
           if (index == 2 && deltas[2] == deltas[0]) index++;
@@ -421,16 +421,6 @@ group_cleanup:
           }
           comm->p2pSendCount = comm->p2pRecvCount = 0;
         }
-        /* Free all proxy ops in state->nextOps */
-        struct ncclProxyState* state = &comm->proxyState;
-	pthread_mutex_lock(&state->poolMutex);
-	for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
-          op->next = state->pool;
-          state->pool = op;
-        }
-	pthread_mutex_unlock(&state->poolMutex);
-        state->nextOps = NULL;
-
         ncclLaunchReset(comm);
       }
     }
diff --git a/src/include/alloc.h b/src/include/alloc.h
index 49ef05d1e0..94c6b65627 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -31,16 +31,37 @@ static inline ncclResult_t ncclCudaHostFree(void* ptr) {
 }
 
 template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { // Allocate nelem zero-filled elements of T into *ptr; filefunc/line are only referenced by the disabled trace below
   void* p = malloc(nelem*sizeof(T));
   if (p == NULL) {
     WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
     return ncclSystemError;
   }
+  //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
   memset(p, 0, nelem*sizeof(T));
   *ptr = (T*)p;
   return ncclSuccess;
 }
+#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) // Existing ncclCalloc(ptr, nelem) call sites keep working; the macro appends the call site
+
+template <typename T>
+static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { // Grow *ptr from oldNelem to nelem elements of T, zero-filling the new tail
+  if (nelem < oldNelem) return ncclInternalError; // Shrinking is not supported
+  if (nelem == oldNelem) return ncclSuccess; // Nothing to do; *ptr is kept as is
+
+  T* oldp = *ptr;
+  T* p = (T*)malloc(nelem*sizeof(T));
+  if (p == NULL) {
+    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+    return ncclSystemError; // *ptr still refers to the old buffer on failure
+  }
+  if (oldNelem > 0) memcpy(p, oldp, oldNelem*sizeof(T)); // Skip the copy for an empty (possibly NULL) old buffer: memcpy with a null source is undefined even for 0 bytes
+  free(oldp);
+  memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
+  *ptr = (T*)p;
+  INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
+  return ncclSuccess;
+}
 
 struct __attribute__ ((aligned(64))) allocationTracker {
   union {
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 6f3f02cdb4..76a2b5a14b 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,18 +9,17 @@
 #define NCCL_BOOTSTRAP_H_
 
 #include "nccl.h"
+#include "comm.h"
 
 ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
-ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
+ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
 ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
-ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
-ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
 ncclResult_t bootstrapClose(void* commState);
 ncclResult_t bootstrapAbort(void* commState);
 #endif
diff --git a/src/include/checks.h b/src/include/checks.h
index 98cf164133..6efc2cf663 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -61,6 +61,49 @@
   } \
 } while(true)
 
+#define SYSCHECKGOTO(statement, res, label) do { \
+  if ((statement) == -1) {    \
+    /* statement returned -1: set res to ncclSystemError, log the call site, jump to label */ \
+    res = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
+#define NEQCHECK(statement, value) do {   \
+  if ((statement) != (value)) {           \
+    /* statement differs from value: log the call site and return ncclSystemError */ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError);    \
+    return ncclSystemError;     \
+  }                             \
+} while (0);
+
+#define NEQCHECKGOTO(statement, value, res, label) do { \
+  if ((statement) != (value)) { \
+    /* statement differs from value: set res to ncclSystemError, log the call site, jump to label */ \
+    res = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
+#define EQCHECK(statement, value) do {    \
+  if ((statement) == (value)) {           \
+    /* statement equals the failure value: log the call site and return ncclSystemError */ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError);    \
+    return ncclSystemError;     \
+  }                             \
+} while (0);
+
+#define EQCHECKGOTO(statement, value, res, label) do { \
+  if ((statement) == (value)) { \
+    /* statement equals the failure value: set res to ncclSystemError, log the call site, jump to label */ \
+    res = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
 // Propagate errors up
 #define NCCLCHECK(call) do { \
   ncclResult_t res = call; \
@@ -80,4 +123,39 @@
   } \
 } while (0);
 
+#define NCCLWAIT(call, cond, abortFlagPtr) do {         /* Re-run call until cond holds */ \
+  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);     \
+  ncclResult_t res = call;                \
+  if (res != ncclSuccess) {               \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    return ncclInternalError;             /* res is logged above but not propagated */ \
+  }                                       \
+  if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); /* A non-zero abort flag returns ncclSystemError */ \
+} while (!(cond));
+
+#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { /* Re-run call until cond holds */ \
+  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);             \
+  res = call;                             \
+  if (res != ncclSuccess) {               \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label;                           /* res keeps the error returned by call */ \
+  }                                       \
+  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); /* A non-zero abort flag sets res to ncclSystemError and jumps to label */ \
+} while (!(cond));
+
+#define NCCLCHECKTHREAD(a) do { /* Needs a local `args` with a `ret` member in scope at the call site */ \
+  if ((args->ret = (a)) != ncclSuccess) { \
+    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+    return args; /* The error is reported through args->ret; the enclosing function returns args */ \
+  } \
+} while(0)
+
+#define CUDACHECKTHREAD(a) do { /* Needs a local `args` with a `ret` member in scope at the call site */ \
+  if ((a) != hipSuccess) { \
+    args->ret = ncclUnhandledCudaError; /* Assign before logging so the trace shows this failure, not a stale ret */ \
+    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+    return args; \
+  } \
+} while(0)
+
 #endif
diff --git a/src/include/coll_net.h b/src/include/coll_net.h
index 0d17b76036..c2d831e916 100644
--- a/src/include/coll_net.h
+++ b/src/include/coll_net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -29,6 +29,6 @@ static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK
 static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
 static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
 
-static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; }
+static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; }
 
 #endif
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 518643cd14..db8fc99fd3 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -37,8 +37,8 @@ struct ncclDevRedOpFull {
 
 /* Declare all collective operations */
 #define DECL5(func, algo, proto, devredop, type) \
-  extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(struct ncclWorkElem* args); \
-  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem c); \
+  extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
+  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \
 
 #define CONCAT(a,b) a##b
 #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
@@ -96,18 +96,18 @@ DECL(AllReduce)
 DECL5(SendRecv, RING, SIMPLE, Sum, int8_t)
 DECL5(AllToAllPivot, RING, SIMPLE, Sum, int8_t)
 
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(struct ncclWorkElem* args);
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(struct ncclWorkElem* args);
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(struct ncclWorkElem* args);
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(struct ncclWorkElem* args);
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(struct ncclWorkElem* args);
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(struct ncclWorkElem* args);
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(struct ncclWorkElem* args);
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)();
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)();
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)();
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)();
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)();
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)();
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)();
 #if defined(RCCL_BFLOAT16)
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16)(struct ncclWorkElem* args);
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16)();
 #endif
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(struct ncclWorkElem* args);
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(struct ncclWorkElem* args);
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)();
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
 
 // CHUNKSIZE must be a multiple of SLICESIZE
 //#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
diff --git a/src/include/comm.h b/src/include/comm.h
index 8c7ab14ff1..bdcfbf69ac 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,7 +11,7 @@
 #include "transport.h"
 #include "p2p.h"
 // [RCCL]
-#include "clique/CliqueManager.h"
+//#include "clique/CliqueManager.h"
 // [/RCCL]
 
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
@@ -40,8 +40,6 @@ struct cudaLaunchParams {
 #define NCCL_LL128_THREAD_THRESHOLD 8
 #define NCCL_SIMPLE_THREAD_THRESHOLD 64
 
-#define NCCL_MAX_INTRA_RANKS 32
-
 struct ncclSendMem {
   union {
     struct {
@@ -50,10 +48,10 @@ struct ncclSendMem {
       void* ptrExchange;
       uint64_t redOpArgExchange[2];
       char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)];
+      int offsFifo[NCCL_STEPS];
     };
     char pad3[MEM_ALIGN];
   };
-  char buff[1]; // Actually larger than that
 };
 
 struct ncclRecvMem {
@@ -62,18 +60,18 @@ struct ncclRecvMem {
       uint64_t tail;
       char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
       int sizesFifo[NCCL_STEPS];
-      void* ptrsFifo[NCCL_STEPS];
+      int offsFifo[NCCL_STEPS];
+      int flush; // For GDRCopy-based flush
     };
     char pad4[MEM_ALIGN];
   };
-  char buff[1]; // Actually larger than that
 };
 
 typedef hipError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
 
 enum helperThreadState {ThreadStart, ThreadStop};
 
-#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_INTRA_RANKS*NCCL_MAX_OPS)
+#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
 
 struct ncclGraphHelperResources {
   ncclComm* comm;
@@ -91,6 +89,11 @@ struct ncclUserRedOp {
   ncclDevRedOpFull opFull;
 };
 
+struct ncclNodeRanks {
+  int localRanks;
+  int* localRankToRank;
+};
+
 struct ncclComm {
   struct ncclChannel channels[MAXCHANNELS];
 
@@ -108,15 +111,18 @@ struct ncclComm {
   int cudaDev; // my cuda device index
   int64_t busId;   // my PCI bus ID in int format
   cpu_set_t cpuAffinity; // CPU affinity of the GPU
+  int WarpSize;
 
   int node;
   int nNodes;
-
-  // Intra-node rank info
-  int intraNodeGlobalRanks[NCCL_MAX_INTRA_RANKS];
+  int localRank;
   int localRanks;
-  int intraNodeRank;
-  int8_t* rankToIntraNodeRank;
+  int maxLocalRanks;
+  int* rankToNode;
+  int* rankToLocalRank;
+  int* localRankToRank;
+  // localRanks and localRankToRank for all nodes
+  struct ncclNodeRanks* nodeRanks;
 
   enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
   hipStream_t userStream;
@@ -176,14 +182,13 @@ struct ncclComm {
   // Storage for deferred intra-process launch
   hipLaunchParams * intraParams;
   hipLaunchParams *myParams;
+  pthread_t* intraThreads;
   int* intraCudaDevs;
   int* intraCGMode; // Whether we can use CUDA9 CGMD or not
   int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
   struct ncclWorkElem args;
-  void* argsptr;
+  void* argsptrs[2];
 
-  // Global proxy thread
-  pthread_t proxyThread;
   struct ncclProxyState proxyState;
 
   // Whether this communicator uses collNet
@@ -205,8 +210,8 @@ struct ncclComm {
   int p2pRecvCount;
 
   // [RCCL]
-  CliqueManager* cliqueManager;    // CliqueManager handles pointer collection / distribution for clique-based kernels
-  int rootPid;                     // Process ID of root
+  //CliqueManager* cliqueManager;    // CliqueManager handles pointer collection / distribution for clique-based kernels
+  //int rootPid;                     // Process ID of root
   // [/RCCL]
 
   // Store info for cudaGraph
diff --git a/src/include/debug.h b/src/include/debug.h
index 6ce90ee375..7af38fd53d 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,6 +16,9 @@
 #include <string.h>
 #include <pthread.h>
 
+// Conform to pthread and NVTX standard
+#define NCCL_THREAD_NAMELEN 16
+
 extern int ncclDebugLevel;
 extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
@@ -37,4 +40,6 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
 #define TRACE(...)
 #endif
 
+void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
+
 #endif
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 26936f530b..22cbd94c5d 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -13,7 +13,7 @@
 #include "align.h"
 #include <stdint.h>
 // [RCCL] Support for clique-based kernels
-#include "clique/CliqueCommon.h"
+//#include "clique/CliqueCommon.h"
 // [/RCCL]
 
 // Convert volatile access to atomic
@@ -27,7 +27,7 @@
 
 
 #define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
 
 #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
@@ -114,7 +114,7 @@ struct ncclConnInfo {
   uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
 
   int *sizesFifo;     // Sizes fifo from GPU to proxy
-  void* *ptrsFifo;      // Buffer fifo from proxy to GPU
+  int *offsFifo;      // Buffer fifo from proxy to GPU
 
   uint64_t step;      // Keep where we are
   uint64_t llLastCleaning;
@@ -126,10 +126,16 @@ struct ncclConnInfo {
   uint32_t* curr_hdp_reg;  // Current GPU's HDP register
 };
 
+struct ncclProxyConnector {
+  int rank;
+  int localRank;
+  struct ncclProxyConnection* connection;
+  struct ncclComm* comm;
+};
+
 struct ncclConnector {
   int connected;
-  struct ncclProxyArgs *proxyAppend;
-  struct ncclProxyArgs **proxyAppendPtr;
+  struct ncclProxyConnector proxyConn;
   struct ncclTransportComm* transportComm;
   void* transportResources;
   struct ncclConnInfo conn;
@@ -180,90 +186,98 @@ struct ncclDevComm;
 
 #pragma pack(push)  /* push current alignment to stack */
 #pragma pack(8)     /* set alignment to 8 bytes boundary */
-#define NCCL_MAX_WORK_ELEMENTS 1
-#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
-
 /* ncclWork is to be a power of two, currently 8x64 bytes, */
 /* to make sure reads to host from the CUDA kernel are aligned. */
 /* Make sure to adjust padding at the end of ncclWorkElem. */
-struct ncclWorkElem {
-  // Header
-  struct ncclDevComm* comm;
-  uint16_t nThreads;
+#define NCCL_WORK_SIZE 256
+
+enum ncclWorkElemType : uint8_t {
+   ncclWorkTypeUnused=0,
+   ncclWorkTypeColl=1,
+   ncclWorkTypeP2p=2,
+   ncclWorkTypeRegColl=3
+};
+enum ncclWorkElemSubType : uint8_t {
+  ncclWorkSubTypeUnused =0,
+  ncclWorkSubTypeSend,
+  ncclWorkSubTypeRecv
+};
+
+struct ncclWorkElemHeader {
   uint16_t funcIndex;
+  enum ncclWorkElemType type;
+  uint8_t nWarps:5;
+  uint8_t isLast:1;
+};
+
+struct ncclWorkElem {
+  struct ncclWorkElemHeader header;
   uint8_t regUsed;
   uint8_t direct;
-  uint8_t active, redOpArgIsPtr;
+  uint8_t redOpArgIsPtr;
+  uint8_t pad_0;
 
   const void * sendbuff;
   void * recvbuff;
 
-  // Op-specific fields.
+  size_t count;
   union {
-    struct {
-      size_t count;
-      union {
-        size_t lastChunkSize;
-        // Pivot A2A kernel computes chunk size itself.
-        // Instead, it needs the number of bidirectional rings.
-        size_t pivotA2ANumBiRings;
-      };
-      uint64_t redOpArg;
-      uint16_t root;
-      uint8_t bid;
-      uint8_t nChannels;
-      uint16_t connIndex;
-      uint16_t opCount;
-    } coll;
-    struct {
-      size_t sendCount;
-      size_t recvCount;
-      int sendChunkSize;
-      int recvChunkSize;
-      int32_t delta;
-      union {
-        struct {
-          uint16_t nThreads:12;
-          uint16_t sendIdx:2;
-          uint16_t recvIdx:2;
-        };
-        uint16_t padding;
-      };
-      uint16_t opCount;
-    } p2p;
-    // [RCCL] Clique-based arguments
-    //        NOTE: Follows same field structure as coll
-    //              because nChannels is accessed from "coll" struct.
-    struct {
-      size_t count;
-      cliqueDevicePtrs_t* ptrs;
-      uint64_t unused_1;
-      uint16_t unused_2;
-      uint8_t bid;
-      uint8_t nChannels;
-    } clique;
-    // [/RCCL]
-    uint64_t align[4];
+    size_t lastChunkSize;
+    // Pivot A2A kernel computes chunk size itself.
+    // Instead, it needs the number of bidirectional rings.
+    size_t pivotA2ANumBiRings;
   };
+  uint32_t root;
+  uint8_t bid;
+  uint8_t nChannels;
+  uint16_t connIndex;
+  uint64_t redOpArg;
+  uint64_t opCount;
 };
-static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWork size must be a multiple of ncclWorkElem size");
 
-struct ncclWorkRegElem {
+struct ncclWorkElemP2p {
+  struct ncclWorkElemHeader header;
+  int32_t peer;
+  void* buff;
+  size_t count;
+  int chunkSize;
+  uint8_t ngroups:4;
+  uint8_t warpStart:4;
+  uint8_t nWarps:4;
+  enum ncclWorkElemSubType subType:4;
+  uint16_t opCount:12;
+  uint16_t connIndex:4;
+};
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWork size must be a multiple of ncclWorkElemP2p size");
+
+struct ncclWorkElemReg {
   struct ncclWorkElem elem;
   void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
   void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
   void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
 };
-#define NCCL_REG_ELEM_FACTOR 4
-static_assert(sizeof(struct ncclWorkRegElem) == (NCCL_REG_ELEM_FACTOR*sizeof(struct ncclWorkElem)), "ncclWorkRegElem size must be pow2 times ncclWorkElem size");
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size");
+static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size");
+
+#define NCCL_MAX_WORK_ELEMENTS 1
+#define NCCL_MAX_WORK_ELEMENTS_P2P 2
+#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg))
+// Number of named barriers supported by CUDA
+#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
 
 struct ncclWork {
   union {
+    char pad[NCCL_WORK_SIZE];
+    struct ncclWorkElemHeader header;
     struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
-    struct ncclWorkRegElem regElems[NCCL_MAX_WORK_ELEMENTS/NCCL_REG_ELEM_FACTOR];
+    struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
+    struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
   };
 };
 
+static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned");
+
 struct ncclChannel {
   union {
     struct {
@@ -309,10 +323,9 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must
 struct ncclProfElem {
   union {
     struct {
+      uint64_t opCount;
       uint64_t total_cycle;
       uint64_t wait_cycle;      // total wait cycle
-      uint64_t wait_send_cycle;
-      uint64_t wait_recv_cycle;
       // primtive cycles
       uint64_t send_cycle;
       uint64_t directSend_cycle;
@@ -341,22 +354,26 @@ struct ncclProfElem {
       uint64_t directRecvReduceCopySend_byte;
     };
     int data[0x80];
-  };
+  } elem[MAXCHANNELS];
 };
 
 struct ncclProf {
-  struct ncclProfElem elems[MAXCHANNELS];
+  struct ncclProfElem* elems;
 };
+
+#define PROFILE_NUM_ITEMS 1024
 #endif
 
 #ifdef ENABLE_COLLTRACE
 typedef enum {
-  ncclCollTraceNotReady,
-  ncclCollTraceKernelLaunchType,
-  ncclCollTraceKernelEndType,
-  ncclCollTraceCollEndType,
-  ncclCollTraceAbortType,
-  ncclCollTraceDataType
+  ncclCollTraceNotReady = 0,
+  ncclCollTraceKernelLaunchType = 1,
+  ncclCollTraceKernelEndType = 2,
+  ncclCollTraceCollLaunchType = 3,
+  ncclCollTraceAbortType = 4,
+  ncclCollTraceDataType = 5,
+  ncclCollTraceCollElemType = (1<<4),
+  ncclCollTraceP2pElemType = (1<<5),
 } ncclCollTraceDataType_t;
 
 struct ncclCollTrace {
@@ -365,18 +382,24 @@ struct ncclCollTrace {
   int16_t funcIndex;
   uint32_t data_0;
   uint64_t timeStamp;
-  uint64_t opCount;
+  union {
+    uint64_t opCount;
+    uint32_t p2pOpCount[2];
+  };
   union {
     uint64_t data_1;
     struct {
-      uint16_t nThreads;
+      uint8_t nWarps;
       uint8_t bid;
       uint8_t nChannels;
     } coll;
     struct {
-      uint16_t nThreads;
-      uint16_t delta;
-    } p2p;
+      int16_t peer;
+      uint8_t ngroups:4;
+      uint8_t connIndex:4;
+      uint8_t warpStart:4;
+      uint8_t nWarps:4;
+    } p2p[2];
   };
 };
 static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
@@ -397,7 +420,7 @@ struct ncclDevComm {
 
 #ifdef ENABLE_PROFILING
   // Profiling counters
-  struct ncclProf* devProf;
+  struct ncclProf devProf;
 #endif
 
 #ifdef ENABLE_COLLTRACE
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index c6674a83ef..d538a1da77 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,6 +16,7 @@
 #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
 
 size_t ncclKernMaxLocalSize();
+ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
 ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
@@ -32,17 +33,17 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph);
 ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph);
 
 struct ncclBuffRegInfo {
-  void* sendbuffsBase[NCCL_MAX_INTRA_RANKS];
-  void* recvbuffsBase[NCCL_MAX_INTRA_RANKS];
-  void* sendbuffs[NCCL_MAX_INTRA_RANKS];
-  void* recvbuffs[NCCL_MAX_INTRA_RANKS];
+  void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS];
+  void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS];
+  void* sendbuffs[NCCL_MAX_LOCAL_RANKS];
+  void* recvbuffs[NCCL_MAX_LOCAL_RANKS];
   int nBuffs;
 };
 
 // Enqueue information (for kernel and proxy) for each operation
 struct ncclQueueElem {
-  struct ncclWorkElem work;
-  struct ncclProxyArgs proxyArgs;
+  struct ncclWork work;
+  struct ncclProxyOp proxyOp;
   struct ncclBuffRegInfo buffRegInfo;
 };
 
@@ -88,7 +89,7 @@ static void ncclDestroyQueueInfo(void* ptr) {
   // but currently the destroy function of CUDA objects does not allow CUDA API calls
   while (eqElem != NULL) {
     for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
-      if (i == eqInfo->comm->intraNodeRank) continue;
+      if (i == eqInfo->comm->localRank) continue;
       CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i]));
       CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i]));
     }
diff --git a/src/include/graph.h b/src/include/graph.h
index 29f81864f7..4cfe9539a6 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -31,12 +31,15 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
 ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
 
 // Query topology
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
 ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
 #define MAX_XGMI_INTER_GPUS 4
 ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
 ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
+int ncclPxnDisable();
+ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
+ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
 
 // Find CPU affinity
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -54,6 +57,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
 ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
 
 #define NCCL_TOPO_MAX_NODES 256
 
@@ -76,6 +80,7 @@ struct ncclTopoGraph {
   int nChannels;
   float speedIntra;
   float speedInter;
+  float latencyInter;
   int typeIntra;
   int typeInter;
   int sameChannels;
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index 4ec1ac6d4e..63555baf80 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
  *
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -328,7 +328,8 @@ enum ibv_access_flags {
 	IBV_ACCESS_REMOTE_WRITE		= (1<<1),
 	IBV_ACCESS_REMOTE_READ		= (1<<2),
 	IBV_ACCESS_REMOTE_ATOMIC	= (1<<3),
-	IBV_ACCESS_MW_BIND		= (1<<4)
+	IBV_ACCESS_MW_BIND		= (1<<4),
+	IBV_ACCESS_RELAXED_ORDERING     = (1<<20),
 };
 
 struct ibv_pd {
@@ -1065,6 +1066,7 @@ ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context)
 ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
 ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
 struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
+ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
 ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
 ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
 ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
diff --git a/src/include/info.h b/src/include/info.h
index 08a80f69e7..b380389242 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,7 +12,7 @@
 #include "devcomm.h"
 #include "collectives.h"
 
-typedef enum {
+typedef enum : uint8_t {
   ncclPatternRing,
   ncclPatternRingTwice,
   ncclPatternPipelineFrom,
@@ -20,7 +20,9 @@ typedef enum {
   ncclPatternTreeUp,
   ncclPatternTreeDown,
   ncclPatternTreeUpDown,
-  ncclPatternCollTreeUpDown
+  ncclPatternCollTreeUpDown,
+  ncclPatternSend,
+  ncclPatternRecv
 } ncclPattern_t;
 
 // Used to pass NCCL call information between functions
@@ -33,7 +35,7 @@ struct ncclInfo {
   size_t count;
   ncclDataType_t datatype;
   ncclRedOp_t op;
-  int root;
+  int root; // peer for p2p operations
   ncclComm_t comm;
   hipStream_t stream;
   // Algorithm details
@@ -49,14 +51,10 @@ struct ncclInfo {
   size_t nBytes;
   int nstepsPerLoop;
   int nchunksPerLoop;
-  ssize_t sendbytes;
-  ssize_t recvbytes;
-  int recvChunkSize;
-  int sendChunkSize;
-  uint32_t delta;
+  int chunkSize;
   int channelId;
-  uint16_t sendIdx;
-  uint16_t recvIdx;
+  uint16_t connIndex;
+  uint64_t opCount;
 };
 
 #endif
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 389c1eaa93..ce616724cd 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,7 +10,7 @@
 #include "nccl.h"
 #include <stdint.h>
 
-#define NCCL_NET_HANDLE_MAXSIZE 64
+#define NCCL_NET_HANDLE_MAXSIZE 128
 
 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
@@ -31,10 +31,114 @@ typedef struct {
   int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
   int speed;      // Port speed in Mbps.
   int port;       // Port number.
+  float latency;  // Network latency
   int maxComms;   // Maximum number of comms we can create
-}ncclNetProperties_v4_t;
+  int maxRecvs;   // Maximum number of grouped receives.
+}ncclNetProperties_v5_t;
 
-typedef ncclNetProperties_v4_t ncclNetProperties_t;
+typedef ncclNetProperties_v5_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v5_t;
+
+typedef ncclNet_v5_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v5_t;
+
+typedef ncclCollNet_v5_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5
+
+typedef struct {
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  int maxComms;   // Maximum number of comms we can create
+} ncclNetProperties_v4_t;
 
 typedef struct {
   // Name of the network (mainly for logs)
@@ -75,10 +179,6 @@ typedef struct {
   ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v4_t;
 
-typedef ncclNet_v4_t ncclNet_t;
-
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
-
 typedef struct {
   // Name of the collective network (mainly for logs)
   const char* name;
@@ -117,8 +217,4 @@ typedef struct {
   ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v4_t;
 
-typedef ncclCollNet_v4_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
-
 #endif // end include guard
diff --git a/src/include/net.h b/src/include/net.h
index 10a2d85432..0cc50678b5 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,10 +9,14 @@
 
 #include "nccl.h"
 #include "nccl_net.h"
+#include "checks.h"
 
 extern ncclNet_t* ncclNet;
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 
+ncclResult_t ncclNetInit();
+int ncclNetVersion();
+
 // Translation to external API
 static const char* ncclNetName() { return ncclNet->name; }
 static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
@@ -23,60 +26,16 @@ static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCC
 static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
 static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; } // Forwards every argument, including tag, to ncclNet->isend.
+static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } // Forwards to ncclNet->irecv; presumably data/sizes/tags/mhandles each hold n entries - confirm against nccl_net.h.
+static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } // Forwards to ncclNet->iflush with the same array-style arguments as ncclNetIrecv, minus tags.
+static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; } // Forwards to ncclNet->test; done and sizes are passed through untouched.
 static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
 
 // Test whether the current GPU support GPU Direct RDMA.
-#define GPU_BUF_SIZE (2*1024*1024)
-static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
-  int netDevs;
-  NCCLCHECK(ncclNetDevices(&netDevs));
-  *gdrSupport = 0;
-  for (int dev=0; dev<netDevs; dev++) {
-    // Find a net device which is GDR-capable
-    ncclNetProperties_t props;
-    NCCLCHECK(ncclNet->getProperties(dev, &props));
-    if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    *gdrSupport = 1;
-    break;
-#endif
-
-    // Allocate memory on the GPU and try to register it on the NIC.
-    void *lComm = NULL, *sComm = NULL, *rComm = NULL;
-    ncclNetHandle_t handle;
-    void* gpuPtr = NULL;
-    void* mHandle = NULL;
-    ncclResult_t ret;
-    ncclDebugNoWarn = NCCL_NET;
-    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
-    NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
-    NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
-    CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
-    if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
-      NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
-      NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
-      NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
-      *gdrSupport = 1;
-    }
-    ncclDebugNoWarn = 0;
-    CUDACHECK(hipFree(gpuPtr));
-cleanup4:
-    NCCLCHECK(ncclNetCloseRecv(rComm));
-cleanup3:
-    NCCLCHECK(ncclNetCloseSend(sComm));
-cleanup2:
-    NCCLCHECK(ncclNetCloseListen(lComm));
-cleanup1:
-    break;
-  }
-  return ncclSuccess;
-}
+ncclResult_t ncclGpuGdrSupport(int* gdrSupport);
 
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 21ee82eaf8..29731dd835 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,59 +9,13 @@
 
 #include "nccl.h"
 
-// The NVML library doesn't appear to be thread safe
-#include <pthread.h>
-extern pthread_mutex_t nvmlLock;
-#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
-#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
+//#define NCCL_NVML_DIRECT 1  // Uncomment (or predefine to 1) to include the real nvml.h instead of the local declarations below.
+#ifndef NCCL_NVML_DIRECT // Only supply a default when nothing defined the macro earlier.
+#define NCCL_NVML_DIRECT 0 // Default: 0, which selects the "#else" branch of "#if NCCL_NVML_DIRECT".
+#endif // !defined(NCCL_NVML_DIRECT)
 
-#define NVMLLOCKCALL(cmd, ret) do {                      \
-    NVMLLOCK();                                          \
-    ret = cmd;                                           \
-    NVMLUNLOCK();                                        \
-} while(false)
-
-#define NVMLCHECK(cmd) do {                              \
-    nvmlReturn_t e;                                      \
-    NVMLLOCKCALL(cmd, e);                                \
-    if( e != NVML_SUCCESS ) {                            \
-      WARN("NVML failure '%s'", nvmlErrorString(e));     \
-      return ncclSystemError;                            \
-    }                                                    \
-} while(false)
-
-//#define NVML_DIRECT 1
-#ifdef NVML_DIRECT
+#if NCCL_NVML_DIRECT
 #include "nvml.h"
-
-static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
-static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
-static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
-static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
-  NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
-  NVMLCHECK(nvmlDeviceGetIndex(device, index));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
-  NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
-  NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
-                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult) {
-  NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
-  NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
-  return ncclSuccess;
-}
 #else
 // Dynamically handle dependencies on NVML
 
@@ -129,21 +83,56 @@ typedef struct nvmlPciInfo_st
     unsigned int reserved2;
     unsigned int reserved3;
 } nvmlPciInfo_t;
+
+/* P2P capability status codes; this is the type written through the p2pStatus output of ncclNvmlDeviceGetP2PStatus(). */
+typedef enum nvmlGpuP2PStatus_enum
+{
+    NVML_P2P_STATUS_OK     = 0, // Only explicitly numbered value; the rest count up from it.
+    NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, // NOTE(review): spelling "SUPPORED" presumably mirrors upstream nvml.h - confirm before renaming.
+    NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
+    NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
+    NVML_P2P_STATUS_DISABLED_BY_REGKEY,
+    NVML_P2P_STATUS_NOT_SUPPORTED,
+    NVML_P2P_STATUS_UNKNOWN // Last enumerator (value 6).
+} nvmlGpuP2PStatus_t;
+
+/* P2P capability selector; this is the type of the p2pIndex parameter of ncclNvmlDeviceGetP2PStatus(). */
+typedef enum nvmlGpuP2PCapsIndex_enum
+{
+    NVML_P2P_CAPS_INDEX_READ = 0, // Only explicitly numbered value; the rest count up from it.
+    NVML_P2P_CAPS_INDEX_WRITE,
+    NVML_P2P_CAPS_INDEX_NVLINK,
+    NVML_P2P_CAPS_INDEX_ATOMICS,
+    NVML_P2P_CAPS_INDEX_PROP,
+    NVML_P2P_CAPS_INDEX_UNKNOWN // Last enumerator (value 5).
+} nvmlGpuP2PCapsIndex_t;
+
 /* End of nvml.h */
+#endif // NCCL_NVML_DIRECT
 
-ncclResult_t wrapNvmlSymbols(void);
+constexpr int ncclNvmlMaxDevices = 32; // Fixed capacity of the ncclNvmlDevices / ncclNvmlDevicePairs tables declared below.
+struct ncclNvmlDeviceInfo { // Per-device record stored in ncclNvmlDevices.
+  nvmlDevice_t handle;
+  int computeCapabilityMajor, computeCapabilityMinor;
+};
+struct ncclNvmlDevicePairInfo { // Per-device-pair record stored in ncclNvmlDevicePairs.
+  nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; // Presumably the results for NVML_P2P_CAPS_INDEX_READ / _WRITE - confirm in the implementation.
+};
+extern int ncclNvmlDeviceCount; // Presumably the number of valid entries in the tables below - confirm.
+extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
+extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; // Square table: one entry per ordered pair of device slots.
 
-ncclResult_t wrapNvmlInit(void);
-ncclResult_t wrapNvmlShutdown(void);
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
-ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
-                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult);
-ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
-
-#endif // NVML_DIRECT
+// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
+// Outsiders need only call it if they want to inspect the ncclNvml global
+// tables above.
+ncclResult_t ncclNvmlEnsureInitialized();
 
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
+ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
 #endif // End include guard
diff --git a/src/include/p2p.h b/src/include/p2p.h
index 2519873c20..7430a722e7 100644
--- a/src/include/p2p.h
+++ b/src/include/p2p.h
@@ -12,16 +12,18 @@
 struct ncclP2Pinfo {
   void* buff;
   ssize_t nbytes;
+  uint64_t opCount; // Filled from the opCount argument of ncclSaveP2pInfo().
 };
 
 typedef ncclRecyclableList<struct ncclP2Pinfo> ncclP2Plist;
 
-static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes) {
+static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes, uint64_t opCount) { // Obtains a new list element (creating the list when p2p is NULL) and stores buff, nBytes and opCount in it.
   if (p2p == NULL) p2p = new ncclP2Plist();
   struct ncclP2Pinfo* next;
   NCCLCHECK(p2p->getNewElem(&next));
   next->buff = buff;
   next->nbytes = nBytes;
+  next->opCount = opCount; // Recorded alongside buff/nbytes in the same element.
   return ncclSuccess;
 }
 #endif
diff --git a/src/include/param.h b/src/include/param.h
index ca992d71c8..ca243ca6fb 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,77 +8,24 @@
 #ifndef NCCL_PARAM_H_
 #define NCCL_PARAM_H_
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <pwd.h>
+#include <stdint.h>
 
-static const char* userHomeDir() {
-  struct passwd *pwUser = getpwuid(getuid());
-  return pwUser == NULL ? NULL : pwUser->pw_dir;
-}
+const char* userHomeDir();
+void setEnvFile(const char* fileName);
+void initEnv();
 
-static void setEnvFile(const char* fileName) {
-  FILE * file = fopen(fileName, "r");
-  if (file == NULL) return;
+void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
 
-  char *line = NULL;
-  char envVar[1024];
-  char envValue[1024];
-  size_t n = 0;
-  ssize_t read;
-  while ((read = getline(&line, &n, file)) != -1) {
-    if (line[read-1] == '\n') line[read-1] = '\0';
-    int s=0; // Env Var Size
-    while (line[s] != '\0' && line[s] != '=') s++;
-    if (line[s] == '\0') continue;
-    strncpy(envVar, line, std::min(1023,s));
-    envVar[s] = '\0';
-    s++;
-    strncpy(envValue, line+s, 1023);
-    envValue[1023]='\0';
-    setenv(envVar, envValue, 0);
-  }
-  if (line) free(line);
-  fclose(file);
-}
-
-static void initEnv() {
-  char confFilePath[1024];
-  const char * userDir = userHomeDir();
-  if (userDir) {
-    sprintf(confFilePath, "%s/.nccl.conf", userDir);
-    setEnvFile(confFilePath);
-  }
-  sprintf(confFilePath, "/etc/nccl.conf");
-  setEnvFile(confFilePath);
-}
-
-
-#define NCCL_PARAM(name, env, default_value) \
-pthread_mutex_t ncclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
-int64_t ncclParam##name() { \
-  static_assert(default_value != -1LL, "default value cannot be -1"); \
-  static int64_t value = -1LL; \
-  pthread_mutex_lock(&ncclParamMutex##name); \
-  if (value == -1LL) { \
-    value = default_value; \
-    char* str = getenv("NCCL_" env); \
-    if (str && strlen(str) > 0) { \
-      errno = 0; \
-      int64_t v = strtoll(str, NULL, 0); \
-      if (errno) { \
-        INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
-      } else { \
-        value = v; \
-        INFO(NCCL_ALL,"%s set by environment to %lu.", "NCCL_" env, value);  \
-      } \
+#define NCCL_PARAM(name, env, deftVal) /* Defines ncclParam<name>(): calls ncclLoadParam for NCCL_<env> while the cache is unset, then returns the cache. */ \
+  int64_t ncclParam##name() { \
+    constexpr int64_t uninitialized = INT64_MIN; /* sentinel meaning the cache has not been filled yet */ \
+    static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
+    static int64_t cache = uninitialized; /* function-local static: one cache per generated accessor */ \
+    if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { /* relaxed atomic read; branch hinted as unlikely */ \
+      ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); /* receives &cache; presumably stores the resolved value there - confirm in param.cc */ \
     } \
-  } \
-  pthread_mutex_unlock(&ncclParamMutex##name); \
-  return value; \
-}
+    return cache; /* plain (non-atomic) read of the cached value */ \
+  }
 
 #define RCCL_PARAM(name, env, default_value) \
 pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
diff --git a/src/include/profiler.h b/src/include/profiler.h
new file mode 100644
index 0000000000..103af99adf
--- /dev/null
+++ b/src/include/profiler.h
@@ -0,0 +1,37 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+#include "proxy.h"
+
+enum ncclProxyProfileState { // State codes; presumably what callers pass as the int "state" argument of ncclProfilingRecord() - confirm.
+  ncclProxyProfileBegin = 0,
+
+  ncclProxyProfileSendGPUWait = 1, // Send-side states use 1-2.
+  ncclProxyProfileSendWait = 2,
+
+  ncclProxyProfileRecvWait = 1, // Recv-side states use 1-3; 1 and 2 repeat the send-side values, so a raw value alone does not say which side it belongs to.
+  ncclProxyProfileRecvFlushWait = 2,
+  ncclProxyProfileRecvGPUWait = 3,
+
+  ncclProxyProfileEnd = 4,
+
+  ncclProxyProfileSleep = 8, // The remaining groups start at 8, 16 and 24.
+  ncclProxyProfileWakeup = 9,
+
+  ncclProxyProfileIdle = 16,
+  ncclProxyProfileActive = 17,
+
+  ncclProxyProfileAppend = 24,
+  ncclProxyProfileAppendEnd = 25
+};
+
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state);
+void ncclProfilingDump();
+
+#endif
diff --git a/src/include/proxy.h b/src/include/proxy.h
index 1cae10e533..1cf88d7b1c 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,27 +8,47 @@
 #ifndef NCCL_PROXY_H_
 #define NCCL_PROXY_H_
 
+#include "devcomm.h"
+#include "info.h"
+#include "socket.h"
 #include <pthread.h>
 
 enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
 
 struct ncclProxyArgs;
-typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*); // Progress callback type stored in ncclProxyArgs::progress; takes the communicator and the args.
 
 #define NCCL_PROXY_MAX_SUBS MAXCHANNELS
 static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
 
-struct ncclProxySubArgs {
-  struct ncclChannel* channel;
-  struct ncclConnector* connector;
+struct ncclProxyOp { // Operation descriptor stored in ncclProxyOpsPool::ops; its size is pinned to 64 bytes by the static_assert below.
+  struct ncclProxyConnection* connection;
+  int channelId;
   int nsteps;
-  ssize_t sendbytes;
-  ssize_t recvbytes;
-  int sendChunkSize;
-  int recvChunkSize;
-  int delta;
+  ssize_t nbytes;
+  int root;
+  int next; // NOTE(review): presumably an index chaining to another entry of ncclProxyOpsPool::ops - confirm in proxy.cc.
 
-  // Internal state
+  uint64_t opCount;
+  int sliceSteps;
+  int chunkSteps;
+  int chunkSize;
+  ncclDataType_t dtype;
+  ncclRedOp_t redOp;
+  ncclPattern_t pattern; // uint8_t
+  uint8_t protocol;
+  uint16_t connIndex; // Changing any field width above alters sizeof and trips the static_assert below.
+};
+static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
+
+struct ncclProxySubArgs {
+  struct ncclProxyConnection* connection;
+  int channelId;
+  int nsteps;
+  ssize_t nbytes;
+  int peer;
+
+  int groupSize; // Number of consecutive sub operations sharing the same recvComm
   uint64_t base;
   uint64_t posted;
   uint64_t received;
@@ -37,67 +57,128 @@ struct ncclProxySubArgs {
   uint64_t done;
   uint64_t end;
   void* requests[NCCL_STEPS];
+  void* profilingEvents[NCCL_STEPS];
 };
 
 struct ncclProxyArgs {
-  proxyProgressFunc_t progress;
   struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
+  proxyProgressFunc_t progress; // Callback of type proxyProgressFunc_t, i.e. invoked with (comm, args).
   int nsubs;
   int done;
+  uint64_t opCount; // Same name and type as ncclProxyOp::opCount; presumably copied from the op - confirm.
   int sliceSteps;
   int chunkSteps;
   int chunkSize;
-  uint64_t opCount;
-  uint64_t commOpCount;
-  int protocol;
   ncclDataType_t dtype;
   ncclRedOp_t redOp;
   ncclPattern_t pattern;
-  int root;
+  uint8_t protocol; // Same width as ncclProxyOp::protocol.
   int state;
   char* sharedBuff[NCCL_STEPS];
   int sharedSize[NCCL_STEPS];
 
   int idle;
   uint64_t hdp_flushed;
-  uint8_t connIndex;
-  uint8_t sendIdx;
-  uint8_t recvIdx;
 
   // Element linking
-  pthread_mutex_t mutex;
   struct ncclProxyArgs* next;
   struct ncclProxyArgs* nextPeer;
   struct ncclProxyArgs** proxyAppendPtr;
 };
+#define NCCL_MAX_NETDEVS 128
 
-struct ncclProxySharedBuffers {
+// ProxyOps are used to communicate between main thread and service thread
+// Make sure we have enough to store two full rounds of operations on all channels.
+// Otherwise we'd be unable to post half of them to free new elements.
+#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) // 2 rounds x channels x P2P work elements.
+#define NCCL_MAX_LOCAL_RANKS 64 // Sizes the per-rank dimensions of ncclProxyOpsPool below.
+struct ncclProxyOpsPool {
+  struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; // Capacity: MAX_OPS_PER_PEER ops for each of NCCL_MAX_LOCAL_RANKS ranks.
+  volatile int nextOps; // NOTE(review): nextOps/nextOpsEnd presumably index the first/last posted op in ops[] - confirm in proxy.cc.
+  volatile int nextOpsEnd;
+  volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; // One entry per local rank; presumably the head of that rank's free ops - confirm.
+  pthread_mutex_t mutex; // NOTE(review): mutex/cond presumably guard and signal the nextOps hand-off - confirm.
+  pthread_cond_t cond;
+};
+
+struct ncclProxyOps { // Element type of ncclProxyState::proxyOps; refers to an ncclProxyOpsPool through "pool".
+  ncclProxyOpsPool* pool;
+  int count;
+  int freeOp; // NOTE(review): presumably an index into pool->ops - confirm.
+  int nextOps; // Same names as ncclProxyOpsPool::nextOps/nextOpsEnd, but non-volatile here.
+  int nextOpsEnd;
+};
+
+struct ncclProxySharedP2p { // Instantiated twice per peer, as ncclProxyPeer::send and ncclProxyPeer::recv.
+  int refcount;
   int size;
   char* cudaBuff;
   char* hostBuff;
-  struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
-  // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
-  struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
-  void* collNetResources;
+  hipIpcMemHandle_t ipc; // NOTE(review): presumably the IPC handle for cudaBuff - confirm.
+  struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // One slot per channel; send and recv are separated by ncclProxyPeer holding one ncclProxySharedP2p for each.
+};
+
+struct ncclProxySharedCollNet { // Held by value as ncclProxyProgressState::collNet.
+  int size;
+  char* cudaBuff;
+  char* hostBuff;
+  struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; // Two slots per net device; presumably one for send and one for recv - confirm.
+  void* resources; // Opaque pointer; the owning type is not visible in this header.
+};
+
+struct ncclProxyPeer { // Pairs one shared-P2P state for sending with one for receiving.
+  struct ncclProxySharedP2p send;
+  struct ncclProxySharedP2p recv;
+};
+
+struct ncclSharedNetComms { // Four parallel arrays, each with MAXCHANNELS entries.
+  void* sendComm[MAXCHANNELS];
+  void* recvComm[MAXCHANNELS];
+  int sendRefCount[MAXCHANNELS]; // Presumably the reference count of sendComm at the same index - confirm.
+  int recvRefCount[MAXCHANNELS]; // Presumably the reference count of recvComm at the same index - confirm.
+};
 
 struct ncclProxyPool;
-struct ncclProxyState {
-  pthread_cond_t cond;
-  pthread_mutex_t opsMutex;
-  pthread_mutex_t poolMutex;
-  bool stop;
-  struct ncclProxySharedBuffers sharedBuffs;
-  struct ncclProxyArgs* ops;           // Running operations, used by proxy thread
-  struct ncclProxyArgs* postedOps;     // Posted operations, shared between proxy and main thread, locked with opsMutex
-  struct ncclProxyArgs* postedOpsEnd;
-  struct ncclProxyArgs* nextOps;       // Pending operations, used by main thread (could still be cancelled)
-  struct ncclProxyArgs* nextOpsEnd;
-  struct ncclProxyArgs* pool;          // Free operations for main thread
-  struct ncclProxyArgs* poolFreed;     // Freed operations by the progress thread
-  struct ncclProxyArgs* poolReturned;  // Shared between main and progress thread, lock with poolMutex
+struct ncclProxyProgressState { // Embedded by value in ncclProxyState as progressState.
+  // Used by main threads to send work to progress thread
+  struct ncclProxyOpsPool* opsPool;
+  char opsPoolShmSuffix[6]; // NOTE(review): presumably the suffix of the shared-memory name backing opsPool - confirm.
 
+  pthread_t thread;
+  bool stop;
+  struct ncclProxyPeer** localPeers;
+  struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; // One pointer slot per net device (up to NCCL_MAX_NETDEVS).
+  struct ncclProxySharedCollNet collNet;
+  struct ncclProxyArgs* active;
+  struct ncclProxyArgs* pool;
   struct ncclProxyPool* pools;
+  int nextOps;
+};
+
+struct ncclProxyState { // Proxy state, grouped by the thread that uses each part (see the section comments).
+  // Service thread
+  pthread_t thread;
+  struct ncclSocket* listenSock;
+  int stop;
+
+  // Used by main thread
+  union ncclSocketAddress* peerAddresses; // Same type as the peerAddresses parameter of ncclProxyInit().
+  struct ncclSocket* peerSocks;
+  struct ncclProxyOps* proxyOps;
+  void** sharedDevMems;
+
+  // Progress thread
+  struct ncclProxyProgressState progressState; // Held by value, not by pointer.
+};
+
+struct ncclProxyConnection { // Referenced by ncclProxyOp::connection and ncclProxySubArgs::connection.
+  int send, transport, shared; // NOTE(review): send/transport presumably mirror the arguments of ncclProxyConnect() - confirm.
+  int localRank;
+  struct ncclSocket* sock;
+  struct ncclTransportComm* tcomm;
+  struct ncclProxyArgs *proxyAppend;
+  struct ncclProxyArgs **proxyAppendPtr; // Pointer-to-pointer, like ncclProxyArgs::proxyAppendPtr.
+  void* transportResources; // Opaque pointer; the owning type is not visible in this header.
 };
 
 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -108,26 +189,25 @@ enum proxyMode {
   proxyTo = 2
 };
 
-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
+ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
+enum ncclProxyMsgType { // Presumably the values passed as the int "type" argument of ncclProxyCall() - confirm.
+  ncclProxyMsgInit = 1, // Numbering starts at 1; 0 is not assigned to any message.
+  ncclProxyMsgSharedInit = 2,
+  ncclProxyMsgSetup = 3,
+  ncclProxyMsgConnect = 4,
+  ncclProxyMsgStart = 5,
+  ncclProxyMsgClose = 6,
+  ncclProxyMsgAbort = 7,
+  ncclProxyMsgStop = 8
+};
+
+ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
-
-ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
-ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
-ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
-ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
-
-#include <unistd.h>
-
-// Spin wait until func evaluates to true
-template<typename FUNC>
-inline void transportProxyWait(const FUNC& func) {
-  while (!func()) {
-    sched_yield();
-  }
-}
-
+ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
 #endif
diff --git a/src/include/shm.h b/src/include/shm.h
index 0b93995089..08dc8495fd 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,65 +7,9 @@
 #ifndef NCCL_SHM_H_
 #define NCCL_SHM_H_
 
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-// Change functions behavior to match other SYS functions
-static int shm_allocate(int fd, const int shmsize) {
-  int err = posix_fallocate(fd, 0, shmsize);
-  if (err) { errno = err; return -1; }
-  return 0;
-}
-static int shm_map(int fd, const int shmsize, void** ptr) {
-  *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  return (*ptr == MAP_FAILED) ? -1 : 0;
-}
-
-static ncclResult_t shmSetup(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
-  SYSCHECKVAL(shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "shm_open", *fd);
-  if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate");
-  SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap");
-  close(*fd);
-  *fd = -1;
-  if (create) memset(*ptr, 0, shmsize);
-  return ncclSuccess;
-}
-
-static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
-  int fd = -1;
-  void* ptr = MAP_FAILED;
-  ncclResult_t res = ncclSuccess;
-
-  NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
-  CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError);
-  CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError);
-
-  *shmPtr = ptr;
-  return ncclSuccess;
-sysError:
-  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
-hipError:
-  if (fd != -1) close(fd);
-  if (create) shm_unlink(shmname);
-  if (ptr != MAP_FAILED) munmap(ptr, shmsize);
-  *shmPtr = NULL;
-  return res;
-}
-
-static ncclResult_t shmUnlink(const char* shmname) {
-  if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink");
-  return ncclSuccess;
-}
-
-static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
-  CUDACHECK(hipHostUnregister(shmPtr));
-  if (munmap(shmPtr, shmsize) != 0) {
-    WARN("munmap of shared memory failed");
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
+#include "nccl.h"
 
+ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create);
+ncclResult_t ncclShmUnlink(const char* shmname);
+ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize);
 #endif
diff --git a/src/include/socket.h b/src/include/socket.h
index 2dbaaa9f36..d72480b6bb 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,14 +7,13 @@
 #ifndef NCCL_SOCKET_H_
 #define NCCL_SOCKET_H_
 
+#include "nccl.h"
 #include <sys/socket.h>
 #include <arpa/inet.h>
 #include <netinet/tcp.h>
-#include <unistd.h>
 #include <netdb.h>
-#include <ifaddrs.h>
-#include <net/if.h>
-#include "utils.h"
+#include <fcntl.h>
+#include <poll.h>
 
 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
@@ -25,443 +23,48 @@
 #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
 
 /* Common socket address storage structure for IPv4/IPv6 */
-union socketAddress {
+union ncclSocketAddress {
   struct sockaddr sa;
   struct sockaddr_in sin;
   struct sockaddr_in6 sin6;
 };
 
-/* Format a string representation of a (union socketAddress *) socket address using getnameinfo()
- *
- * Output: "IPv4/IPv6 address<port>"
- */
-static inline const char *socketToString(union socketAddress *addr, char *buf) {
-  if (buf == NULL || addr == NULL) return NULL;
-  struct sockaddr *saddr = &addr->sa;
-  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
-  char host[NI_MAXHOST], service[NI_MAXSERV];
-  (void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
-  sprintf(buf, "%s<%s>", host, service);
-  return buf;
-}
+enum ncclSocketState {  // connection state of an ncclSocket, as reported by ncclGetSocketState()
+  ncclSocketConnecting = 0,
+  ncclSocketConnected = 1,
+  ncclSocketError = 2,
+  ncclSocketStateNum = 3  // presumably the number of states above rather than a state itself — TODO confirm
+} ;
 
-static inline uint16_t socketToPort(union socketAddress *addr) {
-  struct sockaddr *saddr = &addr->sa;
-  return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
-}
+struct ncclSocket {
+  int fd;  // file descriptor; set by ncclSocketListen/ncclSocketConnect/ncclSocketAccept on success
+  union ncclSocketAddress addr;  // address to listen on or connect to; after accept, the remote side IP/port
+  volatile uint32_t* abortFlag;  // optional (ncclSocketInit defaults it to NULL); presumably polled to abort waits — TODO confirm
+  int asyncFlag;  // NOTE(review): looks like it selects asynchronous (non-blocking) operation — confirm in socket.cc
+  enum ncclSocketState state;  // connection state, see enum ncclSocketState
+};
 
-/* Allow the user to force the IPv4/IPv6 interface selection */
-static inline int envSocketFamily(void) {
-  int family = -1; // Family selection is not forced, will use first one found
-  char* env = getenv("NCCL_SOCKET_FAMILY");
-  if (env == NULL)
-    return family;
-
-  INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
-
-  if (strcmp(env, "AF_INET") == 0)
-    family = AF_INET;  // IPv4
-  else if (strcmp(env, "AF_INET6") == 0)
-    family = AF_INET6; // IPv6
-  return family;
-}
-
-static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
-#ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-#endif
-  struct netIf userIfs[MAX_IFS];
-  bool searchNot = prefixList && prefixList[0] == '^';
-  if (searchNot) prefixList++;
-  bool searchExact = prefixList && prefixList[0] == '=';
-  if (searchExact) prefixList++;
-  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
-
-  int found = 0;
-  struct ifaddrs *interfaces, *interface;
-  getifaddrs(&interfaces);
-  for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
-    if (interface->ifa_addr == NULL) continue;
-
-    /* We only support IPv4 & IPv6 */
-    int family = interface->ifa_addr->sa_family;
-    if (family != AF_INET && family != AF_INET6)
-      continue;
-
-    TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString((union socketAddress *)interface->ifa_addr, line));
-
-    /* Allow the caller to force the socket family type */
-    if (sock_family != -1 && family != sock_family)
-      continue;
-
-    /* We also need to skip IPv6 loopback interfaces */
-    if (family == AF_INET6) {
-      struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
-      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
-    }
-
-    // check against user specified interfaces
-    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
-      continue;
-    }
-
-    // Check that this interface has not already been saved
-    // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
-    bool duplicate = false;
-    for (int i = 0; i < found; i++) {
-      if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
-    }
-
-    if (!duplicate) {
-      // Store the interface name
-      strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
-      // Store the IP address
-      int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-      memcpy(addrs+found, interface->ifa_addr, salen);
-      found++;
-    }
-  }
-
-  freeifaddrs(interfaces);
-  return found;
-}
-
-static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
-  /* Check family first */
-  int family = local_if.ifa_addr->sa_family;
-  if (family != remote->sa.sa_family) {
-    return false;
-  }
-
-  if (family == AF_INET) {
-    struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
-    struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
-    struct sockaddr_in& remote_addr = remote->sin;
-    struct in_addr local_subnet, remote_subnet;
-    local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
-    remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
-    return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
-  } else if (family == AF_INET6) {
-    struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
-    struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
-    struct sockaddr_in6& remote_addr = remote->sin6;
-    struct in6_addr& local_in6 = local_addr->sin6_addr;
-    struct in6_addr& mask_in6 = mask->sin6_addr;
-    struct in6_addr& remote_in6 = remote_addr.sin6_addr;
-    bool same = true;
-    int len = 16;  //IPv6 address is 16 unsigned char
-    for (int c = 0; c < len; c++) {  //Network byte order is big-endian
-      char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
-      char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
-      if (c1 ^ c2) {
-        same = false;
-        break;
-      }
-    }
-    // At last, we need to compare scope id
-    // Two Link-type addresses can have the same subnet address even though they are not in the same scope
-    // For Global type, this field is 0, so a comparison wouldn't matter
-    same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
-    return same;
-  } else {
-    WARN("Net : Unsupported address family type");
-    return false;
-  }
-}
-
-static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
-#ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-#endif
-  char line_a[SOCKET_NAME_MAXLEN+1];
-  int found = 0;
-  struct ifaddrs *interfaces, *interface;
-  getifaddrs(&interfaces);
-  for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
-    if (interface->ifa_addr == NULL) continue;
-
-    /* We only support IPv4 & IPv6 */
-    int family = interface->ifa_addr->sa_family;
-    if (family != AF_INET && family != AF_INET6)
-      continue;
-
-    // check against user specified interfaces
-    if (!matchSubnet(*interface, remoteAddr)) {
-      continue;
-    }
-
-    // Store the local IP address
-    int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-    memcpy(localAddrs+found, interface->ifa_addr, salen);
-
-    // Store the interface name
-    strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
-
-    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(localAddrs+found, line), socketToString(remoteAddr, line_a));
-    found++;
-    if (found == maxIfs) break;
-  }
-
-  if (found == 0) {
-    WARN("Net : No interface found in the same subnet as remote address %s", socketToString(remoteAddr, line_a));
-  }
-  freeifaddrs(interfaces);
-  return found;
-}
-
-static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) {
-  if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
-    WARN("Net : string is null");
-    return ncclInvalidArgument;
-  }
-
-  bool ipv6 = ip_port_pair[0] == '[';
-  /* Construct the sockaddress structure */
-  if (!ipv6) {
-    struct netIf ni;
-    // parse <ip_or_hostname>:<port> string, expect one pair
-    if (parseStringList(ip_port_pair, &ni, 1) != 1) {
-      WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
-      return ncclInvalidArgument;
-    }
-
-    struct addrinfo hints, *p;
-    int rv;
-    memset(&hints, 0, sizeof(hints));
-    hints.ai_family = AF_UNSPEC;
-    hints.ai_socktype = SOCK_STREAM;
-
-    if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
-      WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
-      return ncclInvalidArgument;
-    }
-
-    // use the first
-    if (p->ai_family == AF_INET) {
-      struct sockaddr_in& sin = ua->sin;
-      memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
-      sin.sin_family = AF_INET;                        // IPv4
-      //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr));  // IP address
-      sin.sin_port = htons(ni.port);                   // port
-    } else if (p->ai_family == AF_INET6) {
-      struct sockaddr_in6& sin6 = ua->sin6;
-      memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
-      sin6.sin6_family = AF_INET6;                     // IPv6
-      sin6.sin6_port = htons(ni.port);                 // port
-      sin6.sin6_flowinfo = 0;                          // needed by IPv6, but possibly obsolete
-      sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
-    } else {
-      WARN("Net : unsupported IP family");
-      return ncclInvalidArgument;
-    }
-
-    freeaddrinfo(p); // all done with this structure
-
-  } else {
-    int i, j = -1, len = strlen(ip_port_pair);
-    for (i = 1; i < len; i++) {
-      if (ip_port_pair[i] == '%') j = i;
-      if (ip_port_pair[i] == ']') break;
-    }
-    if (i == len) {
-      WARN("Net : No valid [IPv6]:port pair found");
-      return ncclInvalidArgument;
-    }
-    bool global_scope = (j == -1 ? true : false);     // If no % found, global scope; otherwise, link scope
-
-    char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
-    memset(ip_str, '\0', sizeof(ip_str));
-    memset(port_str, '\0', sizeof(port_str));
-    memset(if_name, '\0', sizeof(if_name));
-    strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
-    strncpy(port_str, ip_port_pair+i+2, len-i-1);
-    int port = atoi(port_str);
-    if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
-
-    struct sockaddr_in6& sin6 = ua->sin6;
-    sin6.sin6_family = AF_INET6;                       // IPv6
-    inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr));    // IP address
-    sin6.sin6_port = htons(port);                      // port
-    sin6.sin6_flowinfo = 0;                            // needed by IPv6, but possibly obsolete
-    sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name);  // 0 if global scope; intf index if link scope
-  }
-  return ncclSuccess;
-}
-
-static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
-  static int shownIfName = 0;
-  int nIfs = 0;
-  // Allow user to force the INET socket family selection
-  int sock_family = envSocketFamily();
-  // User specified interface
-  char* env = getenv("NCCL_SOCKET_IFNAME");
-  if (env && strlen(env) > 1) {
-    INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
-    // Specified by user : find or fail
-    if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
-    nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
-  } else {
-    // Try to automatically pick the right one
-    // Start with IB
-    nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
-    // else see if we can get some hint from COMM ID
-    if (nIfs == 0) {
-      char* commId = getenv("NCCL_COMM_ID");
-      if (commId && strlen(commId) > 1) {
-	INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
-	// Try to find interface that is in the same subnet as the IP in comm id
-        union socketAddress idAddr;
-        GetSocketAddrFromString(&idAddr, commId);
-        nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
-      }
-    }
-    // Then look for anything else (but not docker or lo)
-    if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
-    // Finally look for docker, then lo.
-    if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
-    if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
-  }
-  return nIfs;
-}
-
-static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) {
-  /* IPv4/IPv6 support */
-  int family = localAddr->sa.sa_family;
-  int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-
-  /* Create socket and bind it to a port */
-  int sockfd = socket(family, SOCK_STREAM, 0);
-  if (sockfd == -1) {
-    WARN("Net : Socket creation failed : %s", strerror(errno));
-    return ncclSystemError;
-  }
-
-#if defined(RCCL_IB_TEST)
-  static int port = 23456;
-  localAddr->sin.sin_port = htons(port++);
-#endif
-
-  if (socketToPort(localAddr)) {
-    // Port is forced by env. Make sure we get the port.
-    int opt = 1;
-#if defined(SO_REUSEPORT)
-    SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
-#else
-    SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
-#endif
-  }
-
-  // localAddr port should be 0 (Any port)
-  SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind");
-
-  /* Get the assigned Port */
-  socklen_t size = salen;
-  SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
-
-#ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-  TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(localAddr, line));
-#endif
-
-  /* Put the socket in listen mode
-   * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
-   */
-  SYSCHECK(listen(sockfd, 16384), "listen");
-  *fd = sockfd;
-  return ncclSuccess;
-}
-
-static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
-  char line[SOCKET_NAME_MAXLEN+1];
-  /* IPv4/IPv6 support */
-  int family = remoteAddr->sa.sa_family;
-  if (family != AF_INET && family != AF_INET6) {
-    WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
-         socketToString(remoteAddr, line), family, AF_INET, AF_INET6);
-    return ncclInternalError;
-  }
-  int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-
-  /* Connect to a hostname / port */
-  *fd = socket(family, SOCK_STREAM, 0);
-  if (*fd == -1) {
-    WARN("Net : Socket creation failed : %s", strerror(errno));
-    return ncclSystemError;
-  }
-
-  const int one = 1;
-  SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
-
-  /*  const int bufsize = 128*1024;
-    SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
-    SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
-
-  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(remoteAddr, line));
-
-  int ret;
-  int timedout_retries = 0;
-  int refused_retries = 0;
-retry:
-  SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
-  if (ret == 0) return ncclSuccess;
-  if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
-    if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
-        (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
-      if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
-      usleep(SLEEP_INT);
-      goto retry;
-    }
-  }
-  WARN("Net : Connect to %s failed : %s", socketToString(remoteAddr, line), strerror(errno));
-  return ncclSystemError;
-}
+const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
+ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
+int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
+int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
+// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
+ncclResult_t ncclSocketListen(struct ncclSocket* sock);
+// Connect to sock->addr. sock->fd is set after a successful call.
+ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
+// Return socket connection state.
+ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state);
+// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket);
 
 #define NCCL_SOCKET_SEND 0
 #define NCCL_SOCKET_RECV 1
-static ncclResult_t socketProgressOpt(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset, int block) {
-  int bytes = 0;
-  char* data = (char*)ptr;
-  char line[SOCKET_NAME_MAXLEN+1];
-  do {
-    if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
-    if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
-    if (op == NCCL_SOCKET_RECV && bytes == 0) {
-      WARN("Net : Connection closed by remote peer %s", socketToString(addr, line));
-      return ncclSystemError;
-    }
-    if (bytes == -1) {
-      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
-        WARN("Net : Call to recv from %s failed : %s", socketToString(addr, line), strerror(errno));
-        return ncclSystemError;
-      } else {
-        bytes = 0;
-      }
-    }
-    (*offset) += bytes;
-  } while (bytes > 0 && (*offset) < size);
-  return ncclSuccess;
-}
-
-static ncclResult_t socketProgress(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) {
-  return socketProgressOpt(op, fd, addr, ptr, size, offset, 0);
-}
-
-static ncclResult_t socketWait(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) {
-  while (*offset < size)
-    NCCLCHECK(socketProgressOpt(op, fd, addr, ptr, size, offset, 1));
-  return ncclSuccess;
-}
-
-static ncclResult_t socketSend(int fd, union socketAddress *addr, void* ptr, int size) {
-  int offset = 0;
-  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, fd, addr, ptr, size, &offset));
-  return ncclSuccess;
-}
-
-static ncclResult_t socketRecv(int fd, union socketAddress *addr, void* ptr, int size) {
-  int offset = 0;
-  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, addr, ptr, size, &offset));
-  return ncclSuccess;
-}
 
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
+ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
+/* Initialize a socket struct. addr, abortFlag and asyncFlag are optional and default to NULL, NULL and 0. */
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
 #endif
diff --git a/src/include/timer.h b/src/include/timer.h
new file mode 100644
index 0000000000..284fec6e05
--- /dev/null
+++ b/src/include/timer.h
@@ -0,0 +1,87 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TIMER_H_
+#define NCCL_TIMER_H_
+
+/*
+ * Minimal TSC-based timing helpers, compiled in only when ENABLE_TIMER is set.
+ *
+ * All state below is `static`, so every translation unit that includes this
+ * header gets its own private set of 8 timer slots (valid indices: 0..7).
+ * The macros keep their historical trailing ';' so existing call sites, with
+ * or without their own semicolon, keep compiling; as a consequence they should
+ * not be used as the unbraced body of an if/else.
+ */
+#if ENABLE_TIMER
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <x86intrin.h>
+
+/* TSC ticks per microsecond; -1 until calibrate() has run. */
+static double freq = -1;
+
+/* Estimate the TSC rate by comparing __rdtsc() against gettimeofday(). */
+static void calibrate() {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  uint64_t timeCycles = __rdtsc();
+  double time = - tv.tv_sec*1E6 - tv.tv_usec;
+  uint64_t total = 0ULL;
+  /* Busy loop; presumably only there to let some time elapse between the two samples. */
+  for (int i=0; i<10000; i++) total += __rdtsc();
+  gettimeofday(&tv, NULL);
+  timeCycles = __rdtsc() - timeCycles;
+  time += tv.tv_sec*1E6 + tv.tv_usec;
+  freq = timeCycles/time;
+}
+
+/* Current TSC value converted to microseconds (calibrates on first use). */
+static inline double gettime() {
+  if (freq == -1) calibrate();
+  return __rdtsc()/freq;
+}
+
+static uint64_t counts[8];     /* number of TIME_START calls per slot */
+static double times[8];        /* accumulated elapsed time per slot */
+static double startTimes[8];   /* timestamp of the last TIME_START per slot */
+
+/* Begin a measurement on slot `index`. */
+#define TIME_START(index) do { \
+  counts[index]++; \
+  startTimes[index] = gettime(); \
+} while (0);
+
+/* End the measurement started by TIME_START(index) and accumulate it. */
+#define TIME_STOP(index) do { \
+  times[index] += gettime() - startTimes[index]; \
+} while (0);
+
+/* Discard a measurement started by TIME_START(index) without timing it. */
+#define TIME_CANCEL(index) do { \
+  counts[index]--; \
+} while (0);
+
+/* Print total/count = average for every used slot, then reset the slots. */
+#define TIME_PRINT(name) do { \
+  printf("%s stats", name); \
+  for (int i=0; i<8; i++) { \
+    if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); \
+    counts[i] = 0; \
+    times[i] = 0; \
+  } \
+  printf("\n"); \
+} while (0);
+#else
+/* Timing disabled: the macros expand to no-ops. */
+#define TIME_START(index) while(0);
+#define TIME_STOP(index) while(0);
+#define TIME_CANCEL(index) while(0);
+#define TIME_PRINT(name)
+#endif
+#endif
diff --git a/src/include/transport.h b/src/include/transport.h
index bd18ac4b0d..25675cf79c 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -1,5 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,12 +12,14 @@
 #include "graph.h"
 #include "nvmlwrap.h"
 #include "core.h"
-#include "proxy.h"
 
-#define NTRANSPORTS 3
+#define NTRANSPORTS 4
 #define TRANSPORT_P2P 0
 #define TRANSPORT_SHM 1
 #define TRANSPORT_NET 2
+#define TRANSPORT_COLLNET 3
+
+#include "proxy.h"
 
 extern struct ncclTransport ncclTransports[];
 
@@ -28,12 +31,15 @@ struct ncclComm;
 struct ncclPeerInfo {
   int rank;
   int cudaDev;
+  int netDev;
   int gdrSupport;
   bool hasFineGrain;
   uint64_t hostHash;
   uint64_t pidHash;
   dev_t shmDev;
   int64_t busId;
+  struct ncclComm* comm;
+  int cudaCompCap;
 };
 
 #define CONNECT_SIZE 128
@@ -44,8 +50,12 @@ struct ncclConnect {
 struct ncclTransportComm {
   ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
   ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
-  ncclResult_t (*free)(void*);
-  ncclResult_t (*proxy)(struct ncclProxyArgs*);
+  ncclResult_t (*free)(struct ncclConnector*);
+  ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
+  ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+  ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+  ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
+  ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
 };
 
 struct ncclTransport {
diff --git a/src/include/utils.h b/src/include/utils.h
index 739a774e14..f08ff3731d 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,6 +8,7 @@
 #define NCCL_UTILS_H_
 
 #include "nccl.h"
+#include "checks.h"
 #include <stdint.h>
 
 int ncclCudaCompCap();
@@ -94,6 +95,11 @@ class ncclRecyclableList {
     return rv;
   }
 
+  T* peakNext() {  // "peek": returns a pointer to the data at cursor; cursor itself is not modified here
+    if (cursor == NULL || cursor == tail) return NULL;  // nothing to return when cursor is unset or equals tail
+    return &cursor->data;
+  }
+
   // Recycle the list without freeing the space
   void recycle() {
     tail = cursor = head;
diff --git a/src/init.cc b/src/init.cc
index 60d74c8867..317b58bcd0 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -30,8 +30,8 @@
 #include "graph/topo.h"
 
 // [RCCL]
-#include "clique/CliqueManager.h"
-#include <hsa/hsa_ext_amd.h>
+//#include "clique/CliqueManager.h"
+//#include <hsa/hsa_ext_amd.h>
 // [/RCCL]
 
 #define STR2(v) #v
@@ -56,93 +56,8 @@ const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "
 NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
 
 NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
-
-ncclNet_t* ncclNet = NULL;
-ncclCollNet_t* ncclCollNet = NULL;
-
 struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
 
-// Returns ncclInternalError if anything fails, causing that network to be ignored.
-ncclResult_t initNet(ncclNet_t* net) {
-  int ndev;
-  if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
-  if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
-  if (ndev <= 0) return ncclSystemError;
-  return ncclSuccess;
-}
-
-ncclResult_t initCollNet(ncclCollNet_t* collnet) {
-  int ndev;
-  if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
-  if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError;
-  if (ndev <= 0) return ncclSystemError;
-  return ncclSuccess;
-}
-
-ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) {
-  char ncclNetPluginName[128];
-  const char* envPluginName = getenv("NCCL_NET_PLUGIN");
-  if (envPluginName && strlen(envPluginName)) {
-    snprintf(ncclNetPluginName, 128, "librccl-net-%s.so", envPluginName);
-    INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName);
-  } else {
-    sprintf(ncclNetPluginName, "librccl-net.so");
-  }
-  void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
-  if (netPluginLib == NULL) {
-    // dlopen does not guarantee to set errno, but dlerror only gives us a
-    // string, so checking errno doesn't hurt to try to provide a better
-    // error message
-    if (errno == ENOENT) {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
-    } else {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
-    }
-    return ncclSuccess;
-  }
-  *net = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
-  if (*net == NULL) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
-    if (netPluginLib != NULL) dlclose(netPluginLib);
-    return ncclSuccess;
-  }
-  // Check for CollNet
-  *collnet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL));
-  if (*collnet == NULL) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol.");
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t initNet() {
-  // Always initialize bootstrap network
-  NCCLCHECK(bootstrapNetInit());
-
-  // Initialize main communication network
-  ncclNet_t* nets[3] = { NULL, &ncclNetIb, &ncclNetSocket };
-  ncclCollNet_t* collNets[3] = { NULL, NULL, NULL };
-  NCCLCHECK(initNetPlugin(nets+0, collNets+0));
-  char* netName = getenv("NCCL_NET");
-
-  for (int i=0; i<3; i++) {
-    if (nets[i] == NULL) continue;
-    if (netName && strcmp(netName, nets[i]->name) != 0) continue;
-    // net plugin is already initialized
-    if (initNet(nets[i]) != ncclSuccess) continue;
-    ncclNet = nets[i];
-    if (collNets[i] && initCollNet(collNets[i]) == ncclSuccess) {
-      ncclCollNet = collNets[i];
-    }
-    break;
-  }
-
-  if (ncclNet == NULL) {
-    WARN("Error: network %s not found.", netName ? netName : "");
-    return ncclInvalidUsage;
-  }
-  return ncclSuccess;
-}
-
 // GDRCOPY support: Off by default
 NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
 
@@ -156,7 +71,8 @@ ncclResult_t initGdrCopy() {
   return ncclSuccess;
 }
 
-NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
+
+NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
 
 pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
 static bool initialized = false;
@@ -168,7 +84,9 @@ static ncclResult_t ncclInit() {
     initEnv();
     initGdrCopy();
     maxLocalSizeBytes = ncclKernMaxLocalSize();
-    NCCLCHECK(initNet());
+    int carveout = ncclParamL1SharedMemoryCarveout();
+    if (carveout) ncclKernSetSharedMemoryCarveout(carveout);
+    NCCLCHECK(ncclNetInit());
     INFO(NCCL_INIT, "Using network %s", ncclNetName());
     initialized = true;
   }
@@ -208,7 +126,7 @@ void *ncclCommThreadMain(void *arg) {
   ncclComm_t comm = (ncclComm_t)arg;
   int head = comm->hostDevComm.collTraceHead;
   #define MAX_NAME_LENGTH 64
-  char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+1));
+  char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
   for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
     for (int al = 0; al < NCCL_NUM_ALGORITHMS; al++) {
       for (int type = 0; type < ncclNumTypes; type++) {
@@ -228,6 +146,8 @@ void *ncclCommThreadMain(void *arg) {
   }
   char* line = func_names+MAX_NAME_LENGTH*FUNC_INDEX_P2P;
   sprintf(line, "SendRecvRingSimpleSum_i8");
+  line += MAX_NAME_LENGTH;
+  sprintf(line, "AllToAllPivotRingSimpleSum_i8");
   do {
     int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS;
     int count;
@@ -257,40 +177,43 @@ void *ncclCommThreadMain(void *arg) {
           (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid,
           fIdx, td->data_0, td->opCount, td->data_1);
       } else {
-        sprintf(line, "## [%12.6f] [%02d:%02d] %06lx",
-          (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, fIdx == FUNC_INDEX_P2P ? (td->opCount + 0x100000): td->opCount);
+        if (fIdx == FUNC_INDEX_P2P || type == ncclCollTraceP2pElemType)
+          sprintf(line, "## [%12.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]);
+        else
+          sprintf(line, "## [%12.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount);
         offset = strlen(line);
-        switch (type) {
-          case ncclCollTraceKernelLaunchType:
-            sprintf(line+offset, " KL HWID %8x %s ",
-              td->data_0, func_names+MAX_NAME_LENGTH*fIdx);
-            offset = strlen(line);
-            if (fIdx > FUNC_INDEX_P2P)
-              sprintf(line+offset, "ERROR bad function index %d", fIdx);
-            else if (fIdx == FUNC_INDEX_P2P)
-              sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks);
-            else
-              sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
-            break;
-          case ncclCollTraceCollEndType:
-            sprintf(line+offset, " CE %s ", func_names+MAX_NAME_LENGTH*fIdx);
-            offset = strlen(line);
-            if (fIdx > FUNC_INDEX_P2P)
-              sprintf(line+offset, "ERROR bad function index %d", fIdx);
-            else if (fIdx == FUNC_INDEX_P2P)
-              sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks);
-            else
-              sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
-            break;
-          case ncclCollTraceKernelEndType:
-            sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks);
-            break;
-          case ncclCollTraceAbortType:
-            sprintf(line+offset, " Abort");
-            break;
-          default:
-            sprintf(line+offset, " unknown collective trace data type");
-            break;
+        if (type == ncclCollTraceCollElemType) {
+          sprintf(line+offset, " CE %s nw %d bi %d nc %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx, td->coll.nWarps, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
+        } else if (type == ncclCollTraceP2pElemType) {
+          sprintf(line+offset, " PE %s %d -> %d/%d/%d/%d conn/nw/ws/ng %d/%d/%d/%d -> %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx,
+            td->p2p[0].peer, td->p2p[0].connIndex, td->p2p[0].nWarps, td->p2p[0].warpStart, td->p2p[0].ngroups,
+            td->p2p[1].connIndex, td->p2p[1].nWarps, td->p2p[1].warpStart, td->p2p[1].ngroups, td->p2p[1].peer, comm->busId, comm->nRanks);
+        } else {
+          switch (type&0xf) {
+            case ncclCollTraceKernelLaunchType:
+            case ncclCollTraceCollLaunchType:
+              if ((type&0xf) == ncclCollTraceKernelLaunchType)
+                sprintf(line+offset, " KL HWID %8x %s", td->data_0, func_names+MAX_NAME_LENGTH*fIdx);
+              else if ((type&0xf) == ncclCollTraceCollLaunchType)
+                sprintf(line+offset, " CL %s", func_names+MAX_NAME_LENGTH*fIdx);
+              offset = strlen(line);
+              if ((type&0xf0) == ncclCollTraceCollElemType)
+                sprintf(line+offset, " nw %d bi %d nc %d busId %lx nRanks %d", td->coll.nWarps, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
+              else if ((type&0xf0) == ncclCollTraceP2pElemType)
+                sprintf(line+offset, " %d -> %d/%d/%d/%d conn/nw/ws/ng %d/%d/%d/%d -> %d busId %lx nRanks %d",
+                  td->p2p[0].peer, td->p2p[0].connIndex, td->p2p[0].nWarps, td->p2p[0].warpStart, td->p2p[0].ngroups,
+                  td->p2p[1].connIndex, td->p2p[1].nWarps, td->p2p[1].warpStart, td->p2p[1].ngroups, td->p2p[1].peer, comm->busId, comm->nRanks);
+              break;
+            case ncclCollTraceKernelEndType:
+              sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks);
+              break;
+            case ncclCollTraceAbortType:
+              sprintf(line+offset, " Abort");
+              break;
+            default:
+              sprintf(line+offset, " unknown collective trace data type");
+              break;
+          }
         }
       }
       INFO(NCCL_COLL, "%s", line);
@@ -311,6 +234,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
+  // First stop all threads before we free anything.
+  NCCLCHECK(ncclProxyDestroy(comm));
+
   delete[] comm->userRedOps;
 
   free(comm->connectSend);
@@ -324,96 +250,38 @@ static ncclResult_t commFree(ncclComm_t comm) {
   free(comm->asyncOps);
 
 #ifdef ENABLE_PROFILING
-#ifdef ENABLE_TIMING_PROFILE
-  struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
-  CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost));
+  struct ncclProf prof;
+  prof.elems = (struct ncclProfElem*)malloc(sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS);
+  CUDACHECK(hipMemcpy(prof.elems, comm->hostDevComm.devProf.elems, sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS, hipMemcpyDeviceToHost));
   #define VEGA_GPU_RTC_FREQUENCY 2.5E7
   if (comm->rank == 0) {
-    INFO(NCCL_INIT, "# %8s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Rank:Ch", "total", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
-    INFO(NCCL_INIT, "# %8s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)");
+    INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank:Ch", "opCt", "total", "  wait", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
+    INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", "   (s)", "   (s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
   }
-  for (int chan=0; chan<comm->nChannels; chan++) {
-    INFO(NCCL_INIT, "# [%03d:%02d] %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f",
-      comm->rank, chan, (double)prof->elems[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0,
-      (double)prof->elems[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0);
+  for (int i = 1; i < PROFILE_NUM_ITEMS; i++) {
+    int valid = 0;
+    for (int chan=0; chan<comm->nChannels; chan++) {
+      struct ncclProfElem *elem = prof.elems+i;
+      if (elem->elem[chan].opCount == 0)
+        continue;
+      valid++;
+      INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
+        comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY,
+        (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY,
+        (elem->elem[chan].send_cycle) ? (double)elem->elem[chan].send_byte/((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+        (elem->elem[chan].recvReduceSend_cycle) ? (double)elem->elem[chan].recvReduceSend_byte/((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+        (elem->elem[chan].directRecvReduceCopySend_cycle) ? (double)elem->elem[chan].directRecvReduceCopySend_byte/((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+        (elem->elem[chan].directRecvCopySend_cycle) ? (double)elem->elem[chan].directRecvCopySend_byte/((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+        (elem->elem[chan].directRecv_cycle) ? (double)elem->elem[chan].directRecv_byte/((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+        (elem->elem[chan].copySend_cycle) ? (double)elem->elem[chan].copySend_byte/((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+        (elem->elem[chan].recv_cycle) ? (double)elem->elem[chan].recv_byte/((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+        (elem->elem[chan].recvCopySend_cycle) ? (double)elem->elem[chan].recvCopySend_byte/((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
+    }
+    if (valid == 0)
+      break;
   }
-  free(prof);
-  CUDACHECK(hipFree(comm->hostDevComm.devProf));
-
-  for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) {
-    if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %7.3f ms (%ld bytes %d measurements)",
-      comm->rank, channel, (float)comm->channels[channel].bw_cumulative,
-      comm->channels[channel].send_byte, comm->channels[channel].bw_count);
-    if (comm->channels[channel].recv_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Recv %7.3f ms (%ld bytes %d measurements)",
-      comm->rank, channel, (float)comm->channels[channel].bw_cumulative,
-      comm->channels[channel].recv_byte, comm->channels[channel].bw_count);
-  }
-#else
-  struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
-  CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost));
-  uint64_t total_cycle = 0, wait_cycle = 0, wait_send_cycle = 0, wait_recv_cycle = 0, send_cycle = 0, directSend_cycle = 0, recv_cycle = 0, \
-    directRecv_cycle = 0, copySend_cycle = 0, directCopySend_cycle = 0, recvCopySend_cycle = 0, directRecvCopySend_cycle = 0, \
-    recvReduceCopy_cycle = 0, recvReduceSend_cycle = 0, recvReduceCopySend_cycle = 0, directRecvReduceCopySend_cycle = 0, \
-    send_byte = 0, directSend_byte = 0, recv_byte = 0, directRecv_byte = 0, copySend_byte = 0, directCopySend_byte = 0, \
-    recvCopySend_byte = 0, directRecvCopySend_byte = 0, recvReduceCopy_byte = 0, recvReduceSend_byte = 0, \
-    recvReduceCopySend_byte = 0, directRecvReduceCopySend_byte = 0;
-  for (int chan=0; chan<comm->nChannels; chan++) {
-    total_cycle += prof->elems[chan].total_cycle;
-    wait_cycle += prof->elems[chan].wait_cycle;
-    wait_send_cycle += prof->elems[chan].wait_send_cycle;
-    wait_recv_cycle += prof->elems[chan].wait_recv_cycle;
-    send_cycle += prof->elems[chan].send_cycle;
-    directSend_cycle += prof->elems[chan].directSend_cycle;
-    recv_cycle += prof->elems[chan].recv_cycle;
-    directRecv_cycle += prof->elems[chan].directRecv_cycle;
-    copySend_cycle += prof->elems[chan].copySend_cycle;
-    directCopySend_cycle += prof->elems[chan].directCopySend_cycle;
-    recvCopySend_cycle += prof->elems[chan].recvCopySend_cycle;
-    directRecvCopySend_cycle += prof->elems[chan].directRecvCopySend_cycle;
-    recvReduceCopy_cycle += prof->elems[chan].recvReduceCopy_cycle;
-    recvReduceSend_cycle += prof->elems[chan].recvReduceSend_cycle;
-    recvReduceCopySend_cycle += prof->elems[chan].recvReduceCopySend_cycle;
-    directRecvReduceCopySend_cycle += prof->elems[chan].directRecvReduceCopySend_cycle;
-    send_byte += prof->elems[chan].send_byte;
-    directSend_byte += prof->elems[chan].directSend_byte;
-    recv_byte += prof->elems[chan].recv_byte;
-    directRecv_byte += prof->elems[chan].directRecv_byte;
-    copySend_byte += prof->elems[chan].copySend_byte;
-    directCopySend_byte += prof->elems[chan].directCopySend_byte;
-    recvCopySend_byte += prof->elems[chan].recvCopySend_byte;
-    directRecvCopySend_byte += prof->elems[chan].directRecvCopySend_byte;
-    recvReduceCopy_byte += prof->elems[chan].recvReduceCopy_byte;
-    recvReduceSend_byte += prof->elems[chan].recvReduceSend_byte;
-    recvReduceCopySend_byte += prof->elems[chan].recvReduceCopySend_byte;
-    directRecvReduceCopySend_byte += prof->elems[chan].directRecvReduceCopySend_byte;
-  }
-  #define VEGA_GPU_RTC_FREQUENCY 2.5E7
-  if (comm->rank == 0) {
-    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", "  wait", "w_send", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
-    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
-  }
-  INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
-    comm->rank, (double)total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
-    (double)wait_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
-    (double)wait_send_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
-    (double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
-    (send_cycle) ? (double)send_byte*comm->nChannels/((double)send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-    (recvReduceSend_cycle) ? (double)recvReduceSend_byte*comm->nChannels/((double)recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-    (directRecvReduceCopySend_cycle) ? (double)directRecvReduceCopySend_byte*comm->nChannels/((double)directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-    (directRecvCopySend_cycle) ? (double)directRecvCopySend_byte*comm->nChannels/((double)directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-    (directRecv_cycle) ? (double)directRecv_byte*comm->nChannels/((double)directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-    (copySend_cycle) ? (double)copySend_byte*comm->nChannels/((double)copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-    (recv_cycle) ? (double)recv_byte*comm->nChannels/((double)recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-    (recvCopySend_cycle) ? (double)recvCopySend_byte*comm->nChannels/((double)recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
-  free(prof);
-  CUDACHECK(hipFree(comm->hostDevComm.devProf));
+  free(prof.elems);
+  CUDACHECK(hipFree(comm->hostDevComm.devProf.elems));
 
   for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) {
     if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)",
@@ -426,7 +294,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
       comm->channels[channel].recv_byte, comm->channels[channel].bw_count);
   }
 #endif
-#endif
 
 #ifdef ENABLE_COLLTRACE
   STORE(&comm->hostDevComm.collTraceExit, 1);
@@ -437,6 +304,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
 
   free(comm->peerInfo);
   ncclTopoFree(comm->topo);
+  for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
+  free(comm->nodeRanks);
+  free(comm->rankToNode);
+  free(comm->rankToLocalRank);
 
   if (comm->bootstrap)
     NCCLCHECK(bootstrapClose(comm->bootstrap));
@@ -460,8 +331,16 @@ static ncclResult_t commFree(ncclComm_t comm) {
   int isLast;
   NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
   if (isLast) {
+    // Wait for all service threads to be done. We could not
+    // do it earlier because it could have blocked and prevented
+    // other ranks in the process from calling ncclCommDestroy.
+    for (int i=0; i<comm->intraRanks; i++) {
+      void* ret;
+      if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret);
+    }
     free(comm->intraBarrier);
     free(comm->intraParams);
+    free(comm->intraThreads);
     free(comm->intraCudaDevs);
     free(comm->intraCGMode);
     free(comm->intraCC);
@@ -525,9 +404,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   comm->collOpCount = 0;
   comm->p2pOpCount = 0;
 
-  comm->argsptr = &comm->args;
+  comm->argsptrs[0] = &comm->devComm;
+  comm->argsptrs[1] = &comm->args;
 #ifdef ENABLE_PROFILING
-  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, 1));
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf.elems, PROFILE_NUM_ITEMS));
 #endif
 
 #ifdef ENABLE_COLLTRACE
@@ -546,11 +426,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   comm->asyncOpCount = 0;
   comm->asyncTotalSize = 0;
   comm->channelSize = ncclParamAggChannelSize();
-  comm->asyncAllocMode = ncclComm::ROUND_ROBIN;
+  comm->asyncAllocMode = ncclComm::SHORTEST_QUEUE;
   char* str = getenv("NCCL_AGG_ALLOC_MODE");
   if (str) INFO(NCCL_ENV, "NCCL_AGG_ALLOC_MODE set by environment to %s", str);
-  if (str && strcmp(str, "SHORTEST_QUEUE") == 0) {
-    comm->asyncAllocMode = ncclComm::SHORTEST_QUEUE;
+  if (str && strcmp(str, "ROUND_ROBIN") == 0) {
+    comm->asyncAllocMode = ncclComm::ROUND_ROBIN;
   }
 
   CUDACHECK(hipDriverGetVersion(&comm->driverVersion));
@@ -577,13 +457,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
   NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));
 
-  // Create a map between global rank and intra-node rank
-  NCCLCHECK(ncclCalloc(&comm->rankToIntraNodeRank, comm->nRanks));
-  memset(comm->rankToIntraNodeRank, -1, comm->nRanks*sizeof(comm->rankToIntraNodeRank[0]));
-
   // Mark channels as non initialized.
   for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
 
+  CUDACHECK(hipDeviceGetAttribute(&comm->WarpSize, hipDeviceAttributeWarpSize, comm->cudaDev));
   *comret = comm;
   return ncclSuccess;
 }
@@ -652,6 +529,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
     info->gdrSupport = 0;
   }
 
+  info->comm = comm;
+  info->cudaCompCap = ncclCudaCompCap();
   return ncclSuccess;
 }
 
@@ -681,7 +560,7 @@ void* waitForNonNullPtr(void* p) {
 
 ncclResult_t initParams(struct ncclComm* comm) {
   hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
-  params->args = (void **)&comm->argsptr;
+  params->args = (void **)&comm->argsptrs;
   params->stream = NULL;
   params->sharedMem = 0;
   params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
@@ -703,6 +582,7 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st
     bar[0] = bar[1] = 0;
     comm->intraBarrier = bar;
     NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+    NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks));
     NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
     int* CGMode;
     NCCLCHECK(ncclCalloc(&CGMode, 1));
@@ -715,11 +595,13 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st
   } else {
     comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
     comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+    comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads);
     comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
     comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
     comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
   }
   comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+  comm->intraThreads[comm->intraRank] = comm->proxyState.thread;
   NCCLCHECK(initParams(comm));
 
   int cgMdLaunch = 1;
@@ -777,10 +659,9 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
 NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
 NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
-NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
+NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0);
 
 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
   // We use 2 AllGathers
@@ -793,75 +674,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
   // [RCCL] Collect the PID of the root
   int rootPid;
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap, &rootPid));
+  NCCLCHECK(bootstrapInit(commId, comm));
   // [/RCCL]
 
   // AllGather1 - begin
-  struct {
-    struct ncclPeerInfo peerInfo;
-    struct ncclComm* comm;
-    int cudaCompCap;
-  } *allGather1Data;
-
-  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
-  allGather1Data[rank].comm = comm;
-  allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
-  struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
-  NCCLCHECK(fillInfo(comm, myInfo, commHash));
-  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
-
   NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
+  NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, commHash));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)));
+
   for (int i = 0; i < nranks; i++) {
-    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
-    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
-      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
+    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
       return ncclInvalidUsage;
     }
   }
 
-  // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
-  int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
-  int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0;
-  int myCompCap = allGather1Data[rank].cudaCompCap;
-  int minCompCap = myCompCap, maxCompCap = myCompCap;
-  for (int i = 0; i < nranks; i++) {
-    if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
-      // Rank is on same node
-      if (intraNodeRanks == 0) intraNodeRank0 = i;
-      if (i == rank) intraNodeRank = intraNodeRanks;
-      comm->intraNodeGlobalRanks[intraNodeRanks] = i;
-      comm->rankToIntraNodeRank[i] = intraNodeRanks;
-      intraNodeRanks++;
-      if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
-        // Rank is in same process
-        if (intraProcRanks == 0) intraProcRank0 = i;
-        if (i == rank) intraProcRank = intraProcRanks;
-        intraProcRanks++;
-      }
-    }
-    minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
-    maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
-  }
-  TRACE(NCCL_INIT,"hostHash[%d] %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
-        rank, allGather1Data[rank].peerInfo.hostHash, intraNodeRank, intraNodeRanks, intraNodeRank0);
-  TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
-        rank, allGather1Data[rank].peerInfo.pidHash, intraProcRank, intraProcRanks, intraProcRank0);
-  if (intraProcRank == -1 || intraProcRank0 == -1 || allGather1Data[intraProcRank0].comm == NULL) {
-    WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
-         rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
-         intraProcRank, intraProcRanks, intraProcRank0);
-    return ncclInternalError;
-  }
-  if (intraNodeRank == -1 || intraNodeRank0 == -1 || intraNodeRanks == 0) {
-    WARN("Failed to determine intra node ranks rank %d hostHash %lx pidHash %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
-         rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
-         intraNodeRank, intraNodeRanks, intraNodeRank0);
-    return ncclInternalError;
-  }
-  struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm;
-  uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash;
-  comm->intraNodeRank = intraNodeRank;
-
   // AllGather1 - end
 
   // Topo detection / System graph creation
@@ -884,11 +711,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   // Print final topology
   NCCLCHECK(ncclTopoPrint(comm->topo));
 
+  // Set Affinity to a CPU local to our GPU, so that all memory we allocate
+  // on the host is local.
+  NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
+  cpu_set_t affinitySave;
+  if (CPU_COUNT(&comm->cpuAffinity)) {
+    sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+    sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  }
+  ncclResult_t ret;
+
+  // Launch proxy service thread
+  NCCLCHECK(ncclProxyCreate(comm));
+
   // Get rings and trees
   struct ncclTopoGraph ringGraph;
   ringGraph.id = 0;
   ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
-  ringGraph.crossNic = ncclParamCrossNic();
   ringGraph.collNet = 0;
   ringGraph.minChannels = 1;
   ringGraph.maxChannels = MAXCHANNELS/2;
@@ -898,7 +737,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   struct ncclTopoGraph treeGraph;
   treeGraph.id = 1;
   treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
-  treeGraph.crossNic = ncclParamCrossNic();
   treeGraph.collNet = 0;
   treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
   treeGraph.maxChannels = ringGraph.nChannels;
@@ -909,39 +747,36 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   collNetGraph.id = 2;
   collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
   collNetGraph.collNet = 1;
-  collNetGraph.crossNic = ncclParamCrossNic();
-  collNetGraph.minChannels = 1;
-  collNetGraph.maxChannels = ringGraph.nChannels;
+  collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
   NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
   NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
 
-  bool allXgmi = true;
+  bool allXgmi = true, hasPeerAccess = true;
+  // Check that all the GPUs have peer access to one another and are XGMI connected
+  for (int i = 0; i < nranks && hasPeerAccess; i++) {
+    int cudaDev1 = comm->peerInfo[i].cudaDev;
+    for (int j = 0; j < nranks; j++) {
+      if (i == j) continue;
+      int cudaDev2 = comm->peerInfo[j].cudaDev;
+      int p2p;
+      if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
+      {
+        hasPeerAccess = false;
+        break;
+      }
+
+      bool isXGMI;
+      // Limit to single intermediate GPU for enabling clique
+      NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1));
+      allXgmi &= isXGMI;
+    }
+  }
+
+#if 0
   { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
     CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
     if (comm->localRanks == comm->nRanks && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)
     {
-      // Check that all the GPUs have peer access to one another and are XGMI connected
-      bool hasPeerAccess = true;
-      for (int i = 0; i < nranks && hasPeerAccess; i++)
-      {
-        int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
-        for (int j = 0; j < nranks; j++)
-        {
-          if (i == j) continue;
-          int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
-          int p2p;
-          if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
-          {
-            hasPeerAccess = false;
-            break;
-          }
-
-          bool isXGMI;
-          // Limit to single intermediate GPU for enabling clique
-          NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1));
-          allXgmi &= isXGMI;
-        }
-      }
       if (hasPeerAccess)
       {
         if (intraProcRanks == nranks)
@@ -960,6 +795,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
     NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
   } // [/RCCL]
+#endif
 
   if (comm->rank == ncclParamGraphDumpFileRank()) {
     struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
@@ -967,11 +803,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   }
 
   // Determine local CollNet support before all-gather
-  if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
-  if (intraNodeRanks > 8) {
-    if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
-    comm->collNetSupport = 0;
+  if (collNetSupport()) {
+    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
+    if (collNetEnable != NULL) {
+      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
+      if (strcmp(collNetEnable, "1") == 0) {
+        comm->collNetSupport = 1;
+      }
+    }
   }
+  if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0;
 
   if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
     if (rcclParamP2pNetDisable() == 0) {
@@ -993,6 +834,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   };
 
   struct {
+    int netDev;
     int collNetSupport;
     int nc;
     struct ncclGraphInfo tree;
@@ -1004,7 +846,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
   int idx;
-  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
+  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
   allGather3Data[rank].nc = 2;
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
     allGather3Data[rank].nc = 4;
@@ -1018,6 +860,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
   if (ringGraph.nChannels > MAXCHANNELS/2)
     allGather3Data[rank].nc = 1;
+  NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev));
   allGather3Data[rank].tree.pattern = treeGraph.pattern;
   allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
   allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -1052,19 +895,50 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   int *nodesFirstRank, *nodesTreePatterns;
   NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
   NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
-  for (int i=0; i<nranks; i++) {
-    int node = -1;
-    int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
-    for (int n=0; n<comm->nNodes; n++) {
-      if (nodesFirstRank[n] == firstRank) node = n;
-    }
-    if (node == -1) {
-      node = comm->nNodes++;
+  NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks));
+  for (int r=0; r<nranks; r++) {
+    int node;
+    int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+    for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+    if (node == comm->nNodes) {
+      comm->nNodes++;
       nodesFirstRank[node] = firstRank;
       // Record tree pattern of each node as they can be different depending on sm arch
-      nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
+      nodesTreePatterns[node] = allGather3Data[r].tree.pattern;
     }
-    if (i == comm->rank) comm->node = node;
+    comm->rankToNode[r] = node;
+  }
+  // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+  NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes));
+  NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks));
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+    comm->nodeRanks[node].localRanks++;
+  }
+  // Allocate ranks arrays for each node
+  for (int n=0; n<comm->nNodes; n++) {
+    NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks));
+    comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+    comm->nodeRanks[n].localRanks = 0;
+  }
+  // And fill the ranks arrays
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+  }
+  comm->node = comm->rankToNode[rank];
+  comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+  comm->localRank = comm->rankToLocalRank[rank];
+  comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+  TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+        rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+  if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+    WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+         rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+         comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+    return ncclInternalError;
   }
 
   int nChannelsOrig = comm->nChannels;
@@ -1072,6 +946,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
   int nc = allGather3Data[0].nc;
   for (int i=0; i<nranks; i++) {
+    comm->peerInfo[i].netDev = allGather3Data[i].netDev;
     allTopoRanks[i] = &allGather3Data[i].topoRanks;
     nc = std::min(allGather3Data[i].nc, nc);
     // Make sure we align all ranks so that the tuning is consistent across ranks
@@ -1079,20 +954,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
     treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
     treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
-    treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
-    treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
+    treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
+    treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
     ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
     ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
     ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
     ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
-    ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
-    ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
+    ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+    ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
     collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
     collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
     collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
     collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
-    collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
-    collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
+    collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
+    collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
     comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
     comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled;
   }
@@ -1106,12 +981,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
   }
 
-  // Determine CollNet support after all-gather now that we know nNodes
-  int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
-  if (comm->nNodes < collNetNodeThreshold) {
-    if (comm->collNetSupport == 1)
+  // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
+  if (comm->collNetSupport == 1) {
+    int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+    if (comm->nNodes < collNetNodeThreshold) {
       INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
-    comm->collNetSupport = 0;
+      comm->collNetSupport = 0;
+    }
+    for (int n=0; n<comm->nNodes; n++) {
+      if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
+        WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
+        comm->collNetSupport = 0;
+        break;
+      }
+    }
   }
 
   int *rings;
@@ -1121,7 +1004,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   free(allTopoRanks);
   free(nodesTreePatterns);
   free(nodesFirstRank);
-  free(allGather1Data);
   free(allGather3Data);
 
   // AllGather3 - end
@@ -1140,16 +1022,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   line[1023] = '\0';
   INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
 
-  // Set Affinity to a CPU local the our GPU, so that all memory we allocate
-  // on the host is local.
-  NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
-  cpu_set_t affinitySave;
-  if (CPU_COUNT(&comm->cpuAffinity)) {
-    sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-    sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
-  }
-  ncclResult_t ret;
-
   NCCLCHECK(computeBuffSizes(comm));
 
   // Connect with prev/next for each ring
@@ -1186,7 +1058,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   // Check if we can setup CollNet
   if (comm->collNetSupport > 0) {
     int collNetSetupFail = 0;
-    int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P};
+    int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P};
     // Find all head ranks
     int nHeads = collNetGraph.nChannels;
     int *heads;
@@ -1200,7 +1072,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       for (int h=0; h<nHeads; h++) {
         const int head = heads[h];
         collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv);
-        if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
+        collNetSetupFail += ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
       }
       // Verify CollNet setup across ranks after trying the first channel
       if (c == 0) {
@@ -1226,8 +1098,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
     // Exchange highest intra-node transport type among ranks
     // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
-    comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
-    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int)));
+    comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
+    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)));
     for (int i=0; i<comm->localRanks; i++) {
       if (highestTypes[i] > comm->intraHighestTransportType)
         comm->intraHighestTransportType = highestTypes[i];
@@ -1245,7 +1117,15 @@ collnet_cleanup:
   TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
 
   // Compute time models for algorithm and protocol combinations
-  NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+  do {
+    int myCompCap = comm->peerInfo[rank].cudaCompCap;
+    int minCompCap = myCompCap, maxCompCap = myCompCap;
+    for (int i = 0; i < nranks; i++) {
+      minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
+      maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
+    }
+    NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+  } while(0);
 
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
@@ -1260,28 +1140,67 @@ collnet_cleanup:
       int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
+        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
           comm->connectRecv[peer] |= (1<<channelId);
         }
       }
       delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
+        if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
           comm->connectSend[peer] |= (1<<channelId);
         }
       }
     }
-    NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0));
+    NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
     free(nvbPeers);
   }
 
-  NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, intraProcRank0Comm));
+  // Connect to local net proxy
+  struct ncclProxyConnector proxyConn;
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
+  NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+
+  // Then to remote ones when using PXN
+  if (ncclPxnDisable() == 0) {
+    int nranks;
+    int* pxnPeers;
+    NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
+    for (int r=0; r<nranks; r++) {
+      NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
+      NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+    }
+    free(pxnPeers);
+  }
+
+  do {
+    // Compute intra-process ranks
+    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+    for (int i = 0; i < nranks; i++) {
+      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+        // Rank is in same process
+        if (intraProcRanks == 0) intraProcRank0 = i;
+        if (i == rank) intraProcRank = intraProcRanks;
+        intraProcRanks++;
+      }
+    }
+    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+        rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+    if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+      WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+          rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+          intraProcRank, intraProcRanks, intraProcRank0);
+      return ncclInternalError;
+    }
+    NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
+  } while(0);
 
   /* Local intra-node barrier */
-  NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->intraNodeGlobalRanks, intraNodeRank, intraNodeRanks, (int)intraNodeRank0pidHash));
+  NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
 
-  if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
+  // Unlink proxy shm to make sure it will be properly cleaned up.
+  NCCLCHECK(ncclProxyShmUnlink(comm));
 
   // We should have allocated all buffers, collective fifos, ... we can
   // restore the affinity.
@@ -1301,15 +1220,16 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   CUDACHECK(hipSetDevice(cudaDev));
   // Set the maximum kernel stack size of all kernels to avoid
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
-  //if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
-  //  TRACE(NCCL_INIT, "Setting hipLimitStackSize to %zi", maxLocalSizeBytes);
-  //  CUDACHECKIGNORE(hipDeviceSetLimit(hipLimitStackSize, maxLocalSizeBytes));
-  //}
+  if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
+    TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes);
+    //CUDACHECKIGNORE(hipDeviceSetLimit(hipLimitStackSize, maxLocalSizeBytes));
+  }
+  *newcomm = NULL;
   NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
   NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
   NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
 
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, allocTracker[(*newcomm)->cudaDev].totalAllocSize);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx localSize %ld used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, maxLocalSizeBytes, allocTracker[(*newcomm)->cudaDev].totalAllocSize);
 
   return ncclSuccess;
 cleanup:
@@ -1397,6 +1317,12 @@ static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) {
 }
 
 static ncclResult_t commDestroy(ncclComm_t comm) {
+  // Try and prevent a double free of the comm struct (user error)
+  if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
+    WARN("comm %p has already been destroyed", comm);
+    return ncclInvalidArgument;
+  }
+
   int savedDevice;
 #ifdef ENABLE_TRACE
   int rank = comm->rank;
@@ -1411,19 +1337,18 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
   TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, LOAD(comm->abortFlag), comm->fatalError);
 
   CUDACHECK(hipStreamSynchronize(comm->groupStream));
-  NCCLCHECK(ncclProxyDestroy(comm));
+
   ncclDestroyQueueInfo(comm->enqueueInfo);
 #if CUDART_VERSION >= 11030
   NCCLCHECK(ncclGraphHelperDestroy(comm));
 #endif
   INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);
+
   NCCLCHECK(commFree(comm));
 
   if (savedDevice != commDevice)
     CUDACHECK(hipSetDevice(savedDevice));
 
-  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
-
   return ncclSuccess;
 }
 
@@ -1433,19 +1358,17 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
-  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
-
-  // Try and prevent a double free of the comm struct (user error)
-  if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
-    WARN("comm %p has already been destroyed", comm);
-    return ncclInvalidArgument;
-  }
+  int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+  int64_t busId = comm->busId;
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
 
   // [RCCL] Delete CliqueManager if it exists
-  if (comm->cliqueManager) delete comm->cliqueManager;
+  //if (comm->cliqueManager) delete comm->cliqueManager;
   // [/RCCL]
 
-  return commDestroy(comm);
+  NCCLCHECK(commDestroy(comm));
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId);
+  return ncclSuccess;
 }
 
 NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
@@ -1454,11 +1377,16 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
+  int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+  int64_t busId = comm->busId;
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
+
   // Ask anything that might still be running on the device to quit
   *comm->abortFlag = 1;
 
   // do not destroy comm because kernel maybe still running
   // return commDestroy(comm);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId);
   return ncclSuccess;
 }
 
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index c39b06de6d..538336c873 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,7 +12,7 @@ static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, con
   hipPointerAttribute_t attr;
   hipError_t err = hipPointerGetAttributes(&attr, pointer);
   if (err != hipSuccess || attr.devicePointer == NULL) {
-    WARN("%s : %s is not a valid pointer", opname, ptrname);
+    WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer);
     return ncclInvalidArgument;
   }
 #if CUDART_VERSION >= 10000
@@ -64,12 +64,9 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
   }
 
   if (info->comm->checkPointers) {
-    if (info->coll == ncclFuncSendRecv) {
-      if (strcmp(info->opName, "Send") == 0) {
-        NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
-      } else {
-        NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv"));
-      }
+    if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) {
+      if (info->count >0)
+        NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName));
     } else {
       // Check CUDA device pointers
       if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc
index 439712e88f..e1aabac8f4 100644
--- a/src/misc/ibvwrap.cc
+++ b/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -29,6 +29,7 @@ int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int at
 struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
 int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
 struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); // Bound in wrap_ibv_symbols via LOAD_SYM_VERSION ("IBVERBS_1.8"); NULL when that versioned symbol is not found
 int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
 struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
 int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
@@ -65,7 +66,7 @@ ncclResult_t wrap_ibv_symbols(void) {
     }
   }
 
-#define LOAD_SYM(handle, symbol, funcptr) do {         \
+#define LOAD_SYM(handle, symbol, funcptr) do {           \
     cast = (void**)&funcptr;                             \
     tmp = dlvsym(handle, symbol, IBVERBS_VERSION);       \
     if (tmp == NULL) {                                   \
@@ -75,6 +76,12 @@ ncclResult_t wrap_ibv_symbols(void) {
     *cast = tmp;                                         \
   } while (0)
 
+// Optional lookup of one specific symbol version: funcptr receives the dlvsym() result as-is, so it is NULL on failure (no warning is logged and there is no jump to teardown)
+#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do {  \
+    cast = (void**)&funcptr;                                     \
+    *cast = dlvsym(handle, symbol, version);                     \
+  } while (0)
+
   LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
   LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
   LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
@@ -89,6 +96,8 @@ ncclResult_t wrap_ibv_symbols(void) {
   LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
   LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
   LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
+  // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
+  LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
   LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
   LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
   LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
@@ -116,6 +125,7 @@ teardown:
   ibv_internal_alloc_pd = NULL;
   ibv_internal_dealloc_pd = NULL;
   ibv_internal_reg_mr = NULL;
+  ibv_internal_reg_mr_iova2 = NULL;
   ibv_internal_dereg_mr = NULL;
   ibv_internal_create_cq = NULL;
   ibv_internal_destroy_cq = NULL;
@@ -260,6 +270,14 @@ struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t len
   return ibv_internal_reg_mr(pd, addr, length, access);
 }
 
+ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { // Wrapper that forwards pd/addr/length/iova/access to the dynamically resolved ibv_reg_mr_iova2 pointer
+  if (ibv_internal_reg_mr_iova2 == NULL) { // Symbol was not resolved (it is loaded optionally), so fail without logging
+    return ncclInternalError;
+  }
+  if (ret == NULL) { return ncclSuccess; } // Dummy call (ret == NULL): nothing is registered; success here only means the function pointer is non-NULL
+  IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); // NOTE(review): IBV_PTR_CHECK is defined outside this hunk; presumably it stores the call result in *ret, treats NULL as failure, and returns - confirm
+}
+
 ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
   IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
 }
diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc
index e83392dcd5..5db7c6be5c 100644
--- a/src/misc/nvmlwrap.cc
+++ b/src/misc/nvmlwrap.cc
@@ -1,219 +1,262 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "nvmlwrap.h"
+#include "checks.h"
+#include "debug.h"
 
-#ifndef NVML_DIRECT
-#include <dlfcn.h>
-#include "core.h"
+#include <initializer_list>
+#include <memory>
+#include <mutex>
 
-static enum { nvmlUninitialized, nvmlInitializing, nvmlInitialized, nvmlError } nvmlState = nvmlUninitialized;
+int ncclNvmlDeviceCount = 0;
+ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
+ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
 
-static nvmlReturn_t (*nvmlInternalInit)(void);
-static nvmlReturn_t (*nvmlInternalShutdown)(void);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
-    nvmlNvLinkCapability_t capability, unsigned int *capResult);
-static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
-
-// Used to make the NVML library calls thread safe
-pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;
-
-ncclResult_t wrapNvmlSymbols(void) {
-  if (nvmlState == nvmlInitialized)
-    return ncclSuccess;
-  if (nvmlState == nvmlError)
-    return ncclSystemError;
-
-  if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) {
-    // Another thread raced in front of us. Wait for it to be done.
-    while (nvmlState == nvmlInitializing) pthread_yield();
-    return (nvmlState == nvmlInitialized) ? ncclSuccess : ncclSystemError;
-  }
-
-  static void* nvmlhandle = NULL;
-  void* tmp;
-  void** cast;
-
-  nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
-  if (!nvmlhandle) {
-    WARN("Failed to open libnvidia-ml.so.1");
-    goto teardown;
-  }
-
-#define LOAD_SYM(handle, symbol, funcptr) do {         \
-    cast = (void**)&funcptr;                             \
-    tmp = dlsym(handle, symbol);                         \
-    if (tmp == NULL) {                                   \
-      WARN("dlsym failed on %s - %s", symbol, dlerror());\
-      goto teardown;                                     \
-    }                                                    \
-    *cast = tmp;                                         \
-  } while (0)
-
-#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\
-    cast = (void**)&funcptr;                             \
-    tmp = dlsym(handle, symbol);                         \
-    if (tmp == NULL) {                                   \
-      INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
-    }                                                    \
-    *cast = tmp;                                         \
-  } while (0)
-
-  LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
-  LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
-  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
-  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
-  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);
-
-  nvmlState = nvmlInitialized;
-  return ncclSuccess;
-
-teardown:
-  nvmlInternalInit = NULL;
-  nvmlInternalShutdown = NULL;
-  nvmlInternalDeviceGetHandleByPciBusId = NULL;
-  nvmlInternalDeviceGetIndex = NULL;
-  nvmlInternalDeviceGetNvLinkState = NULL;
-  nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
-  nvmlInternalDeviceGetNvLinkCapability = NULL;
-
-  if (nvmlhandle != NULL) dlclose(nvmlhandle);
-  nvmlState = nvmlError;
-  return ncclSystemError;
-}
-
-
-ncclResult_t wrapNvmlInit(void) {
-  if (nvmlInternalInit == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret = nvmlInternalInit();
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlInit() failed: %s",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlShutdown(void) {
-  if (nvmlInternalShutdown == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret = nvmlInternalShutdown();
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlShutdown() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
-  if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
-  if (nvmlInternalDeviceGetIndex == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetIndex() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
-  if (nvmlInternalDeviceGetNvLinkState == NULL) {
-    /* Do not warn, this symbol is optional. */
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
-  if (ret != NVML_SUCCESS) {
-    if (ret != NVML_ERROR_NOT_SUPPORTED)
-      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
-          nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
-  if (nvmlInternalDeviceGetNvLinkRemotePciInfo == NULL) {
-    /* Do not warn, this symbol is optional. */
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
-  if (ret != NVML_SUCCESS) {
-    if (ret != NVML_ERROR_NOT_SUPPORTED)
-      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
-          nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
-    nvmlNvLinkCapability_t capability, unsigned int *capResult) {
-  if (nvmlInternalDeviceGetNvLinkCapability == NULL) {
-    /* Do not warn, this symbol is optional. */
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
-  if (ret != NVML_SUCCESS) {
-    if (ret != NVML_ERROR_NOT_SUPPORTED)
-      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
-          nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
-  if (nvmlInternalDeviceGetNvLinkCapability == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret;
-  NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
+#if NCCL_NVML_DIRECT // direct mode: each pfn_<name> is a constexpr pointer to the NVML symbol itself
+  #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name;
+#else // dynamic mode: each pfn_<name> starts out null and is assigned from dlsym() during initialization
+  #include <dlfcn.h>
+  #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr;
 #endif
+
+namespace { // file-local state: one pfn_<name> entry point per NVML function, plus initialization bookkeeping
+  NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
+  NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
+  NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
+  NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
+  NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
+  NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
+  NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device))
+  NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
+  NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
+  NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive))
+  NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci))
+  NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult))
+  NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
+  NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
+
+  std::mutex lock; // NVML has had some thread safety bugs
+  bool initialized = false; // set under `lock` by the first caller of ncclNvmlEnsureInitialized()
+  thread_local bool threadInitialized = false; // per-thread flag that lets repeat callers skip taking `lock`
+  ncclResult_t initResult; // outcome of the one-time initialization, returned to every caller
+}
+
+ncclResult_t ncclNvmlEnsureInitialized() { // one-time NVML setup; fills the cached device and device-pair tables
+  // Optimization to avoid repeatedly grabbing the lock when we only want to
+  // read from the global tables.
+  if (threadInitialized) return initResult;
+  threadInitialized = true;
+
+  std::lock_guard<std::mutex> locked(lock);
+
+  if (initialized) return initResult; // another thread already ran (or failed) the initialization
+  initialized = true;
+
+  #if !NCCL_NVML_DIRECT
+  if (pfn_nvmlInit == nullptr) {
+    void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
+    if (libhandle == nullptr) {
+      WARN("Failed to open libnvidia-ml.so.1");
+      initResult = ncclSystemError;
+      return initResult;
+    }
+
+    struct Symbol { void **ppfn; char const *name; };
+    std::initializer_list<Symbol> symbols = {
+      {(void**)&pfn_nvmlInit, "nvmlInit"},
+      {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
+      {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
+      {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
+      {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
+      {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
+      {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
+      {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
+      {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
+      {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
+      {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
+      {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
+      {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
+      {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"}
+    };
+    for(Symbol sym: symbols) {
+      *sym.ppfn = dlsym(libhandle, sym.name); // a missing symbol leaves its pointer null
+    }
+  }
+  #endif
+
+  #if NCCL_NVML_DIRECT
+    bool have_v2 = true;
+  #else
+    bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null
+  #endif
+  nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
+  if (res1 != NVML_SUCCESS) {
+    WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
+    initResult = ncclSystemError;
+    return initResult;
+  }
+
+  unsigned int ndev;
+  res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
+  if (res1 != NVML_SUCCESS) {
+    WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" :"", pfn_nvmlErrorString(res1));
+    initResult = ncclSystemError;
+    return initResult;
+  }
+
+  ncclNvmlDeviceCount = int(ndev);
+  if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) {
+    WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices);
+    initResult = ncclInternalError;
+    return initResult;
+  }
+
+  for(int a=0; a < ncclNvmlDeviceCount; a++) { // cache handle and compute capability for every device
+    res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle);
+    if (res1 != NVML_SUCCESS) {
+      WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+      initResult = ncclSystemError;
+      return initResult;
+    }
+
+    res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor);
+    if (res1 != NVML_SUCCESS) {
+      WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+      initResult = ncclSystemError;
+      return initResult;
+    }
+  }
+
+  for(int a=0; a < ncclNvmlDeviceCount; a++) { // cache read/write P2P status for every ordered device pair
+    for(int b=0; b < ncclNvmlDeviceCount; b++) {
+      nvmlDevice_t da = ncclNvmlDevices[a].handle;
+      nvmlDevice_t db = ncclNvmlDevices[b].handle;
+
+      res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead);
+      if (res1 != NVML_SUCCESS) {
+        WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
+        initResult = ncclSystemError;
+        return initResult;
+      }
+
+      res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite);
+      if (res1 != NVML_SUCCESS) {
+        WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1));
+        initResult = ncclSystemError;
+        return initResult;
+      }
+    }
+  }
+
+  initResult = ncclSuccess;
+  return initResult;
+}
+
+#define NVMLCHECK(name, ...) do { /* call pfn_<name>(args); on failure WARN and return ncclSystemError from the enclosing function */ \
+  nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
+  if (e44241808 != NVML_SUCCESS) { \
+    WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+    return ncclSystemError; \
+  } \
+} while(0)
+
+#define NVMLTRY(name, ...) do { /* call pfn_<name>(args) for an optional symbol; failures are logged at INFO level only */ \
+  if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \
+    return ncclInternalError; /* missing symbol is not a warned error */ \
+  nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
+  if (e44241808 != NVML_SUCCESS) { \
+    if (e44241808 != NVML_ERROR_NOT_SUPPORTED) /* NOT_SUPPORTED is returned as an error but not logged */ \
+      INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+    return ncclSystemError; \
+  } \
+} while(0)
+
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { // asks NVML for the handle of the device at pciBusId
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  std::lock_guard<std::mutex> locked(lock); // calls into NVML are serialized through the file-level mutex
+  NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device); // returns ncclSystemError on NVML failure
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { // returns the handle cached for `index`; NVML is not called
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  *device = ncclNvmlDevices[index].handle; // NOTE(review): index is not checked against ncclNvmlDeviceCount here - confirm callers guarantee the range
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+  // Reverse lookup: find the position of `device` in the table cached at initialization.
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  int slot = 0;
+  while (slot < ncclNvmlDeviceCount && ncclNvmlDevices[slot].handle != device) slot++;
+  if (slot >= ncclNvmlDeviceCount) return ncclInvalidArgument; // handle is not in the cached table
+
+  *index = slot;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { // queries NVML for the state of one NVLink
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  std::lock_guard<std::mutex> locked(lock); // calls into NVML are serialized through the file-level mutex
+  NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive); // optional symbol: ncclInternalError if absent, ncclSystemError if the call fails
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { // queries NVML for the PCI info at the far end of one NVLink
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  std::lock_guard<std::mutex> locked(lock); // calls into NVML are serialized through the file-level mutex
+  NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci); // optional symbol: ncclInternalError if absent, ncclSystemError if the call fails
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability( // queries NVML for one capability of one NVLink
+    nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability,
+    unsigned int *capResult
+  ) {
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  std::lock_guard<std::mutex> locked(lock); // calls into NVML are serialized through the file-level mutex
+  NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult); // optional symbol: ncclInternalError if absent, ncclSystemError if the call fails
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+  // Served from the per-device table cached at initialization; NVML is not called here.
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+
+  int slot = 0;
+  while (slot < ncclNvmlDeviceCount && ncclNvmlDevices[slot].handle != device) slot++;
+  if (slot >= ncclNvmlDeviceCount) return ncclInvalidArgument; // handle is not in the cached table
+
+  *major = ncclNvmlDevices[slot].computeCapabilityMajor;
+  *minor = ncclNvmlDevices[slot].computeCapabilityMinor;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvmlDeviceGetP2PStatus( // reports the P2P status between two devices for the capability p2pIndex
+    nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,
+    nvmlGpuP2PStatus_t* p2pStatus
+  ) {
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+
+  if (p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) { // READ/WRITE come from the table filled at initialization
+    int a = -1, b = -1; // table positions of device1 / device2, -1 while not found
+    for(int d=0; d < ncclNvmlDeviceCount; d++) {
+      if(device1 == ncclNvmlDevices[d].handle) a = d;
+      if(device2 == ncclNvmlDevices[d].handle) b = d;
+    }
+    if (a == -1 || b == -1) return ncclInvalidArgument; // at least one handle is not in the cached table
+    if (p2pIndex == NVML_P2P_CAPS_INDEX_READ)
+      *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusRead;
+    else
+      *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusWrite;
+  }
+  else { // every other capability index is forwarded to NVML under the lock
+    std::lock_guard<std::mutex> locked(lock);
+    NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
+  }
+  return ncclSuccess;
+}
diff --git a/src/misc/nvmlwrap_stub.cc b/src/misc/nvmlwrap_stub.cc
index 41485df101..a9462f0d04 100644
--- a/src/misc/nvmlwrap_stub.cc
+++ b/src/misc/nvmlwrap_stub.cc
@@ -7,50 +7,50 @@
 
 #include "nvmlwrap.h"
 
-ncclResult_t wrapNvmlSymbols(void) {
+ncclResult_t ncclNvmlSymbols(void) {
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlInit(void) {
+ncclResult_t ncclNvmlInit(void) {
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlShutdown(void) {
+ncclResult_t ncclNvmlShutdown(void) {
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
   return ncclSystemError;
 }
 
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
   *index  = 0;
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+ncclResult_t ncclNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
   return ncclSystemError;
 }
 
-ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+ncclResult_t ncclNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
   *minorNumber = 0;
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
   return ncclSystemError;
 }
 
-ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
   return ncclSystemError;
 }
 
-ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
     nvmlNvLinkCapability_t capability, unsigned int *capResult) {
   return ncclSystemError;
 }
 
-ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
   *major = *minor = 1;
   return ncclSuccess;
 }
diff --git a/src/misc/param.cc b/src/misc/param.cc
new file mode 100644
index 0000000000..a59713cf3b
--- /dev/null
+++ b/src/misc/param.cc
@@ -0,0 +1,81 @@
+/*************************************************************************
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "param.h"
+#include "debug.h"
+
+#include <algorithm>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <pwd.h>
+
+const char* userHomeDir() {
+  const struct passwd* entry = getpwuid(getuid()); // passwd record of the calling user, NULL if the lookup fails
+  return entry ? entry->pw_dir : NULL;
+}
+
+void setEnvFile(const char* fileName) {
+  // Reads NAME=VALUE lines from fileName and exports each variable that is not already set.
+  FILE * file = fopen(fileName, "r");
+  if (file == NULL) return; // a file that cannot be opened is silently ignored
+
+  char *line = NULL;
+  char envVar[1024], envValue[1024];
+  size_t n = 0;
+  ssize_t read;
+  while ((read = getline(&line, &n, file)) != -1) {
+    if (line[read-1] == '\n') line[read-1] = '\0';
+    int s=0; // Env Var Size
+    while (line[s] != '\0' && line[s] != '=') s++;
+    if (line[s] == '\0') continue; // no '=' on this line: skip it
+    const int nameLen = std::min(1023,s); // clamp so the terminator below stays inside envVar
+    strncpy(envVar, line, nameLen);
+    envVar[nameLen] = '\0';
+    s++;
+    strncpy(envValue, line+s, 1023);
+    envValue[1023]='\0';
+    setenv(envVar, envValue, 0); // overwrite=0: a value already in the environment wins
+  }
+  free(line); // buffer allocated by getline(); free(NULL) is a no-op
+  fclose(file);
+}
+
+void initEnv() {
+  // Loads <home>/.rccl.conf and then /etc/rccl.conf. The per-user file goes first because
+  // setEnvFile() does not override a variable that is already set.
+  char confFilePath[1024];
+  const char * userDir = userHomeDir();
+  // snprintf bounds the write; a home directory too long for the buffer is skipped rather than truncated.
+  if (userDir && snprintf(confFilePath, sizeof(confFilePath), "%s/.rccl.conf", userDir) < (int)sizeof(confFilePath))
+    setEnvFile(confFilePath);
+  setEnvFile("/etc/rccl.conf");
+}
+
+void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { // fills *cache from the environment variable `env`, or with deftVal
+  static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; // one mutex shared by all parameters
+  pthread_mutex_lock(&mutex);
+  if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { // only load while the cache still holds the sentinel
+    char* str = getenv(env);
+    int64_t value = deftVal;
+    if (str && strlen(str) > 0) { // an unset or empty variable keeps the default
+      errno = 0;
+      value = strtoll(str, nullptr, 0); // base 0: accepts decimal, 0x-prefixed hex and 0-prefixed octal
+      if (errno) { // NOTE(review): non-numeric text may parse as 0 without setting errno - confirm whether that is intended
+        value = deftVal;
+        INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
+      } else {
+        INFO(NCCL_ALL,"%s set by environment to %lld.", env, (long long)value);
+      }
+    }
+    __atomic_store_n(cache, value, __ATOMIC_RELAXED);
+  }
+  pthread_mutex_unlock(&mutex);
+}
diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc
new file mode 100644
index 0000000000..145b18fe8c
--- /dev/null
+++ b/src/misc/profiler.cc
@@ -0,0 +1,115 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "profiler.h"
+
+//#define PROFILE_PROXY 1
+#ifdef PROFILE_PROXY
+#include "timer.h"
+#include "alloc.h"
+
+static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" };
+static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" };
+static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" };
+struct ncclProxyProfileEvent {
+  double timestamp[6];
+  uint64_t opCount;
+  int peer;
+  int step;
+  uint16_t channel;
+  uint8_t type; // send / recv
+  uint8_t opIndex;
+};
+
+struct ncclProxyProfileEvent* profilingEvents = NULL;
+int profilingIndex = 0;
+double profilingStart = 0;
+#define MAX_EVENTS 200000
+
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { // stores a timestamp for `state` in the event tied to (args, sub, step)
+  if (profilingEvents == NULL) { // first call: allocate the event table and record the time origin
+    NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS));
+    profilingStart = gettime();
+  }
+  struct ncclProxyProfileEvent* event = NULL;
+  if (state%8 == 0) { // states that are multiples of 8 open a new event
+    if (profilingIndex == MAX_EVENTS) return ncclSuccess; // table full: the event is dropped
+    args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++;
+    if (state == ncclProxyProfileBegin) {
+      // Proxy operation information
+      event->opCount = args->opCount;
+      event->channel = args->subs[sub].channelId;
+      event->peer = args->subs[sub].peer;
+      event->type = args->pattern;
+      event->step = step;
+      event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256;
+    } else event->peer = -state; // other opening states store the negated state in `peer`
+  } else { // any other state updates the event saved in this step's slot
+    event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; // NOTE(review): may be NULL if the opening state was dropped above - confirm
+    if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL;
+    if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount;
+  }
+  // Timestamp
+  event->timestamp[state%8] = gettime()-profilingStart; // stored relative to the first recorded call
+  return ncclSuccess;
+}
+
+void ncclProfilingDump() { // writes the recorded events as a JSON array to the file named by NCCL_PROXY_PROFILE, at most once
+  static int dumpDone = 0;
+  if (dumpDone) return;
+  dumpDone = 1;
+  const char* str = getenv("NCCL_PROXY_PROFILE");
+  if (!str) { free(profilingEvents); return; }
+  FILE* f = fopen(str, "w");
+  if (f == NULL) { free(profilingEvents); return; } // output file cannot be created: drop the events instead of writing through NULL
+  fprintf(f, "[\n");
+
+  for (int i=0; i<profilingIndex; i++) {
+    struct ncclProxyProfileEvent* e = profilingEvents+i;
+    const int sendrecv = e->peer >= 0; // a negative peer marks a non send/recv event
+    const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") :
+      profilingEventStr[-(e->peer/8)];
+
+    if (sendrecv) {
+      int state = ncclProxyProfileBegin;
+      const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr;
+      fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n",
+          typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex);
+
+      while (state<ncclProxyProfileEnd) {
+        if (e->timestamp[state]) {
+          const char* name = stateStr[state];
+          fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+              name, i, e->channel, e->timestamp[state]);
+          state++;
+          while (e->timestamp[state] == 0) state++;
+          fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+              name, i, e->channel, e->timestamp[state]);
+        }
+      }
+
+      fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+          typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]);
+    } else {
+      if (e->peer == -ncclProxyProfileAppend) {
+      fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n",
+          typeStr, i, e->timestamp[0], e->opCount);
+      } else {
+        fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
+          typeStr, i, e->timestamp[0]);
+      }
+      fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
+          typeStr, i, e->timestamp[1]);
+    }
+  }
+  fprintf(f, "{} ]\n"); // empty trailing object closes the list after the last comma
+  fclose(f);
+  free(profilingEvents);
+}
+#else
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } // PROFILE_PROXY not defined: recording is a no-op
+void ncclProfilingDump() {} // PROFILE_PROXY not defined: nothing to dump
+#endif
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
new file mode 100644
index 0000000000..a8a3c4e534
--- /dev/null
+++ b/src/misc/shmutils.cc
@@ -0,0 +1,90 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "shm.h"
+#include "checks.h"
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+// Adapt these calls to the -1/errno convention used by the other SYS functions
+static int shm_allocate(int fd, const int shmSize) {
+  int err = posix_fallocate(fd, 0, shmSize); // result is treated as an error number, not as -1/errno
+  if (err) { errno = err; return -1; } // convert it to the -1/errno convention
+  return 0;
+}
+static int shm_map(int fd, const int shmSize, void** ptr) { // maps shmSize bytes of fd read/write and shared; result goes to *ptr
+  *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  return (*ptr == MAP_FAILED) ? -1 : 0; // -1 on failure, 0 on success
+}
+
+static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) { // opens (or creates) shmPath and maps shmSize bytes of it into *ptr
+  if (create) {
+    if (shmPath[0] == '\0') { // no name supplied: generate a unique one, written back into shmPath
+      sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
+      *fd = mkstemp(shmPath); // a failed mkstemp (-1) is caught by the ftruncate check below
+    } else {
+      SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
+    }
+    if (ftruncate(*fd, shmSize) != 0) {
+      WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize);
+      return ncclSystemError; // *fd is left for the caller's error path to close
+    }
+  } else {
+    SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
+  }
+  *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
+  if (*ptr == MAP_FAILED) { // mmap() signals failure with MAP_FAILED, not NULL
+    WARN("Could not map %s\n", shmPath);
+    return ncclSystemError;
+  }
+  close(*fd); // the descriptor is not needed once the mapping exists
+  *fd = -1;
+  if (create) memset(*ptr, 0, shmSize); // a newly created segment starts zeroed
+  return ncclSuccess;
+}
+
+ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) { // maps a shared memory file and, if devShmPtr is set, registers it with the device
+  int fd = -1;
+  void* ptr = MAP_FAILED;
+  ncclResult_t res = ncclSuccess;
+
+  NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
+  if (devShmPtr) { // the device-visible alias is optional
+    CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, cudaError);
+    CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+  }
+
+  *shmPtr = ptr;
+  return ncclSuccess;
+sysError:
+  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
+cudaError: // shared cleanup; the sysError path falls through into it
+  if (fd != -1) close(fd);
+  if (create) unlink(shmPath); // shmPath is a filesystem path, so remove it with unlink() as ncclShmUnlink() does
+  if (ptr != MAP_FAILED) munmap(ptr, shmSize);
+  *shmPtr = NULL;
+  return res;
+}
+
+ncclResult_t ncclShmUnlink(const char* shmPath) { // removes the file at shmPath; a NULL path is a no-op
+  if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink");
+  return ncclSuccess;
+}
+
+ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) { // unregisters (when devShmPtr is set) and unmaps a segment
+  if (devShmPtr) CUDACHECK(hipHostUnregister(shmPtr)); // the host pointer is what gets unregistered
+  if (munmap(shmPtr, shmSize) != 0) {
+    WARN("munmap of shared memory failed");
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
new file mode 100644
index 0000000000..ef2bea65a5
--- /dev/null
+++ b/src/misc/socket.cc
@@ -0,0 +1,556 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "socket.h"
+#include "utils.h"
+#include <stdlib.h>
+
+#include <unistd.h>
+#include <ifaddrs.h>
+#include <net/if.h>
+
+/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
+ *
+ * Output: "IPv4/IPv6 address<port>"
+ */
+const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
+  if (buf == NULL || addr == NULL) return NULL;
+  struct sockaddr *saddr = &addr->sa;
+  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } // unsupported family: empty string
+  char host[NI_MAXHOST] = "", service[NI_MAXSERV] = ""; // start empty so a failed getnameinfo() yields "<>" instead of reading uninitialized bytes
+  /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
+   * (When not set, this will still happen in case the node's name cannot be determined.)
+   */
+  int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
+  (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
+  sprintf(buf, "%s<%s>", host, service);
+  return buf;
+}
+
+static uint16_t socketToPort(union ncclSocketAddress *addr) {
+  const bool isIPv4 = (addr->sa.sa_family == AF_INET); // every other family is read as sockaddr_in6
+  return ntohs(isIPv4 ? addr->sin.sin_port : addr->sin6.sin6_port);
+}
+
+/* Reads NCCL_SOCKET_FAMILY so the user can force IPv4 ("AF_INET") or IPv6 ("AF_INET6") */
+static int envSocketFamily(void) {
+  const int unforced = -1; // no family forced: the first one found will be used
+  const char* requested = getenv("NCCL_SOCKET_FAMILY");
+  if (!requested) {
+    return unforced;
+  }
+
+  INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", requested);
+
+  // Any value other than the two names below leaves the selection unforced.
+  if (!strcmp(requested, "AF_INET")) return AF_INET;   // IPv4
+  if (!strcmp(requested, "AF_INET6")) return AF_INET6; // IPv6
+  return unforced;
+}
+
+static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { // collects up to maxIfs IPv4/IPv6 interfaces matching prefixList; returns how many
+#ifdef ENABLE_TRACE
+  char line[SOCKET_NAME_MAXLEN+1];
+#endif
+  struct netIf userIfs[MAX_IFS];
+  bool searchNot = prefixList && prefixList[0] == '^'; // leading '^' inverts the match
+  if (searchNot) prefixList++;
+  bool searchExact = prefixList && prefixList[0] == '='; // leading '=' is passed to matchIfList as the exact-match flag
+  if (searchExact) prefixList++;
+  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
+
+  int found = 0;
+  struct ifaddrs *interfaces, *interface;
+  if (getifaddrs(&interfaces) != 0) interfaces = NULL; // enumeration failed: treat it as an empty list
+  for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
+    if (interface->ifa_addr == NULL) continue;
+
+    /* We only support IPv4 & IPv6 */
+    int family = interface->ifa_addr->sa_family;
+    if (family != AF_INET && family != AF_INET6)
+      continue;
+
+    TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line));
+
+    /* Allow the caller to force the socket family type */
+    if (sock_family != -1 && family != sock_family)
+      continue;
+
+    /* We also need to skip IPv6 loopback interfaces */
+    if (family == AF_INET6) {
+      struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
+      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+    }
+
+    // check against user specified interfaces
+    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
+      continue;
+    }
+
+    // Check that this interface has not already been saved
+    // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
+    bool duplicate = false;
+    for (int i = 0; i < found; i++) {
+      if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+    }
+
+    if (!duplicate) {
+      // Store the interface name
+      strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
+      // Store the IP address
+      int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+      memcpy(addrs+found, interface->ifa_addr, salen);
+      found++;
+    }
+  }
+
+  if (interfaces) freeifaddrs(interfaces);
+  return found;
+}
+
+static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) { // true when remote is in the subnet of local_if (address and netmask)
+  /* Check family first; an interface without a netmask cannot be compared */
+  int family = local_if.ifa_addr->sa_family;
+  if (local_if.ifa_netmask == NULL || family != remote->sa.sa_family) {
+    return false;
+  }
+
+  if (family == AF_INET) {
+    struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
+    struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
+    struct sockaddr_in& remote_addr = remote->sin;
+    struct in_addr local_subnet, remote_subnet;
+    local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
+    remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
+    return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
+  } else if (family == AF_INET6) {
+    struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
+    struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
+    struct sockaddr_in6& remote_addr = remote->sin6;
+    struct in6_addr& local_in6 = local_addr->sin6_addr;
+    struct in6_addr& mask_in6 = mask->sin6_addr;
+    struct in6_addr& remote_in6 = remote_addr.sin6_addr;
+    bool same = true;
+    int len = 16;  //IPv6 address is 16 unsigned char
+    for (int c = 0; c < len; c++) {  //Network byte order is big-endian
+      char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
+      char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
+      if (c1 ^ c2) {
+        same = false;
+        break;
+      }
+    }
+    // At last, we need to compare scope id
+    // Two Link-type addresses can have the same subnet address even though they are not in the same scope
+    // For Global type, this field is 0, so a comparison wouldn't matter
+    same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
+    return same;
+  } else {
+    WARN("Net : Unsupported address family type");
+    return false;
+  }
+}
+
+int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { // finds a local interface in the same subnet as remoteAddr; returns the number stored
+#ifdef ENABLE_TRACE
+  char line[SOCKET_NAME_MAXLEN+1];
+#endif
+  char line_a[SOCKET_NAME_MAXLEN+1];
+  int found = 0;
+  struct ifaddrs *interfaces, *interface;
+  if (getifaddrs(&interfaces) != 0) interfaces = NULL; // enumeration failed: treat it as an empty list
+  for (interface = interfaces; interface && !found; interface = interface->ifa_next) { // stops at the first match
+    if (interface->ifa_addr == NULL) continue;
+
+    /* We only support IPv4 & IPv6 */
+    int family = interface->ifa_addr->sa_family;
+    if (family != AF_INET && family != AF_INET6)
+      continue;
+
+    // skip interfaces that are not in the same subnet as the remote address
+    if (!matchSubnet(*interface, remoteAddr)) {
+      continue;
+    }
+
+    // Store the local IP address
+    int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+    memcpy(localAddrs+found, interface->ifa_addr, salen);
+
+    // Store the interface name
+    strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
+
+    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a));
+    found++;
+    if (found == maxIfs) break;
+  }
+
+  if (found == 0) {
+    WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a));
+  }
+  if (interfaces) freeifaddrs(interfaces);
+  return found;
+}
+
+// Parses "<ipv4_or_hostname>:<port>" or "[<ipv6>[%<ifname>]]:<port>" into *ua.
+//   ua           : output socket address
+//   ip_port_pair : text to parse; a leading '[' selects the IPv6 bracket form
+// Returns ncclSuccess, or ncclInvalidArgument when the text is missing, cannot
+// be parsed/resolved, or has a component too long for its buffer.
+ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
+  if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
+    WARN("Net : string is null");
+    return ncclInvalidArgument;
+  }
+
+  bool ipv6 = ip_port_pair[0] == '[';
+  /* Construct the sockaddress structure */
+  if (!ipv6) {
+    struct netIf ni;
+    // parse <ip_or_hostname>:<port> string, expect one pair
+    if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+      WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
+      return ncclInvalidArgument;
+    }
+
+    struct addrinfo hints, *p = NULL;
+    int rv;
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family = AF_UNSPEC;
+    hints.ai_socktype = SOCK_STREAM;
+
+    if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+      WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
+      return ncclInvalidArgument;
+    }
+
+    // use the first
+    ncclResult_t res = ncclSuccess;
+    if (p->ai_family == AF_INET) {
+      struct sockaddr_in& sin = ua->sin;
+      memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
+      sin.sin_family = AF_INET;                        // IPv4
+      sin.sin_port = htons(ni.port);                   // port
+    } else if (p->ai_family == AF_INET6) {
+      struct sockaddr_in6& sin6 = ua->sin6;
+      memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
+      sin6.sin6_family = AF_INET6;                     // IPv6
+      sin6.sin6_port = htons(ni.port);                 // port
+      sin6.sin6_flowinfo = 0;                          // needed by IPv6, but possibly obsolete
+      sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
+    } else {
+      WARN("Net : unsupported IP family");
+      res = ncclInvalidArgument;
+    }
+
+    // Release the resolver result on every path, including the unsupported-family one
+    freeaddrinfo(p);
+    return res;
+
+  } else {
+    // i ends on the closing ']', j on the last '%' seen before it (or stays -1)
+    int i, j = -1, len = strlen(ip_port_pair);
+    for (i = 1; i < len; i++) {
+      if (ip_port_pair[i] == '%') j = i;
+      if (ip_port_pair[i] == ']') break;
+    }
+    if (i == len) {
+      WARN("Net : No valid [IPv6]:port pair found");
+      return ncclInvalidArgument;
+    }
+    bool global_scope = (j == -1);     // If no % found, global scope; otherwise, link scope
+
+    char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
+    memset(ip_str, '\0', sizeof(ip_str));
+    memset(port_str, '\0', sizeof(port_str));
+    memset(if_name, '\0', sizeof(if_name));
+
+    // Component lengths, excluding terminators. The port starts two characters
+    // after ']' and may be absent, in which case it parses as 0.
+    const int ipLen = global_scope ? i-1 : j-1;
+    const int portLen = (len-i-2 > 0) ? len-i-2 : 0;
+    const int ifLen = global_scope ? 0 : i-j-1;
+    // Refuse anything that would overflow a buffer; each copy must leave room
+    // for the terminator already provided by the memset above
+    if (ipLen >= (int)sizeof(ip_str) || portLen >= (int)sizeof(port_str) || ifLen >= (int)sizeof(if_name)) {
+      WARN("Net : No valid [IPv6]:port pair found");
+      return ncclInvalidArgument;
+    }
+    if (ipLen > 0) strncpy(ip_str, ip_port_pair+1, ipLen);
+    if (portLen > 0) strncpy(port_str, ip_port_pair+i+2, portLen);
+    int port = atoi(port_str);
+    if (ifLen > 0) strncpy(if_name, ip_port_pair+j+1, ifLen); // If not global scope, we need the intf name
+
+    struct sockaddr_in6& sin6 = ua->sin6;
+    sin6.sin6_family = AF_INET6;                       // IPv6
+    // inet_pton returns 1 only when the text is a valid IPv6 address
+    if (inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)) != 1) {
+      WARN("Net : No valid [IPv6]:port pair found");
+      return ncclInvalidArgument;
+    }
+    sin6.sin6_port = htons(port);                      // port
+    sin6.sin6_flowinfo = 0;                            // needed by IPv6, but possibly obsolete
+    sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name);  // 0 if global scope; intf index if link scope
+  }
+  return ncclSuccess;
+}
+
+// Selects the network interface(s) to use for sockets.
+//   ifNames       : output; interface names, one slot of ifNameMaxSize bytes each
+//   ifAddrs       : output; address of each selected interface
+//   maxIfs        : capacity of the two output arrays
+// When NCCL_SOCKET_IFNAME is set only that specification is searched.
+// Otherwise candidates are tried in order: "ib", an interface in the same
+// subnet as NCCL_COMM_ID, anything except docker/lo, then docker, then lo.
+// Returns the number of interfaces found.
+int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+  static int shownIfName = 0;
+  int nIfs = 0;
+  // Allow user to force the INET socket family selection
+  int sock_family = envSocketFamily();
+  // User specified interface
+  char* env = getenv("NCCL_SOCKET_IFNAME");
+  if (env && strlen(env) > 1) {
+    INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
+    // Specified by user : find or fail
+    if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
+    nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+  } else {
+    // Try to automatically pick the right one
+    // Start with IB
+    nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    // else see if we can get some hint from COMM ID
+    if (nIfs == 0) {
+      char* commId = getenv("NCCL_COMM_ID");
+      if (commId && strlen(commId) > 1) {
+        INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+        // Try to find interface that is in the same subnet as the IP in comm id.
+        // Only use idAddr if parsing succeeded; otherwise it is uninitialized
+        // and we fall through to the generic search below.
+        union ncclSocketAddress idAddr;
+        if (ncclGetSocketAddrFromString(&idAddr, commId) == ncclSuccess) {
+          nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
+        }
+      }
+    }
+    // Then look for anything else (but not docker or lo)
+    if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    // Finally look for docker, then lo.
+    if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+  }
+  return nIfs;
+}
+
+// Configures an already-created socket for listening: optional address/port
+// reuse, non-blocking mode, bind, read back the bound address, listen.
+// Returns on the first failure WITHOUT closing fd; the caller owns it.
+static ncclResult_t socketListenFd(int fd, struct ncclSocket* sock, int salen) {
+  int flags;
+
+  if (socketToPort(&sock->addr)) {
+    // Port is forced by env. Make sure we get the port.
+    int opt = 1;
+    // SO_REUSEADDR and SO_REUSEPORT are distinct option names, not bit flags,
+    // so each one needs its own setsockopt() call
+    SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+#if defined(SO_REUSEPORT)
+    SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+#endif
+  }
+
+  /* make all new sockets non-blocking */
+  EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+  SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+
+  // addr port should be 0 (Any port)
+  SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
+
+  /* Get the assigned Port */
+  socklen_t size = salen;
+  SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname");
+
+#ifdef ENABLE_TRACE
+  char line[SOCKET_NAME_MAXLEN+1];
+  TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line));
+#endif
+
+  /* Put the socket in listen mode
+   * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+   */
+  SYSCHECK(listen(fd, 16384), "listen");
+  return ncclSuccess;
+}
+
+// Creates a non-blocking listening socket bound to sock->addr.
+// On success sock->fd holds the descriptor and sock->addr the address actually
+// bound. On failure the descriptor is closed and sock->fd is left untouched.
+ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
+  /* IPv4/IPv6 support */
+  int family = sock->addr.sa.sa_family;
+  int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+
+  /* Create socket and bind it to a port */
+  int fd = socket(family, SOCK_STREAM, 0);
+  if (fd == -1) {
+    WARN("Net : Socket creation failed : %s", strerror(errno));
+    return ncclSystemError;
+  }
+
+  ncclResult_t res = socketListenFd(fd, sock, salen);
+  if (res != ncclSuccess) {
+    // Do not leak the descriptor when any setup step fails
+    close(fd);
+    return res;
+  }
+  sock->fd = fd;
+  return ncclSuccess;
+}
+
+// Reports the connection state of fd through *state.
+// Polls fd for writability with a 1 ms timeout: no event means the connection
+// is still in progress; otherwise the pending SO_ERROR value decides between
+// connected (0), still connecting (EINPROGRESS) and error (anything else).
+static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
+  struct pollfd pollReq;
+  memset(&pollReq, 0, sizeof(struct pollfd));
+  pollReq.fd = fd;
+  pollReq.events = POLLOUT;
+
+  const int timeoutMs = 1;
+  int status;
+  SYSCHECK(status = poll(&pollReq, 1, timeoutMs), "poll");
+
+  if (status != 0) {
+    /* check socket status */
+    socklen_t optLen = sizeof(int);
+    EQCHECK(status == 1 && (pollReq.revents & POLLOUT), 0);
+    SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&status, &optLen), "getsockopt");
+  } else {
+    // poll timed out without any event: nothing has been decided yet
+    status = EINPROGRESS;
+  }
+
+  switch (status) {
+    case EINPROGRESS: *state = ncclSocketConnecting; break;
+    case 0:           *state = ncclSocketConnected;  break;
+    default:          *state = ncclSocketError;      break;
+  }
+  return ncclSuccess;
+}
+
+// Queries the state of sock's descriptor, returns it through *state and caches
+// it in sock->state. Neither is updated when the query itself fails.
+ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) {
+  enum ncclSocketState polled;
+  NCCLCHECK(getFdState(sock->fd, &polled));
+  *state = polled;
+  sock->state = polled;
+  return ncclSuccess;
+}
+
+// Configures fd (TCP_NODELAY, non-blocking) and connects it to sock->addr,
+// retrying on EAGAIN and, a bounded number of times, on ECONNREFUSED/ETIMEDOUT.
+// For a connection still in progress: returns immediately when sock->asyncFlag
+// is set, otherwise waits for completion while honouring sock->abortFlag.
+// Returns on the first failure WITHOUT closing fd; the caller owns it.
+static ncclResult_t socketConnectFd(int fd, struct ncclSocket* sock, int salen) {
+  char line[SOCKET_NAME_MAXLEN+1];
+  int flags;
+
+  const int one = 1;
+  SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+
+  /* support non-blocking socket; by default, the socket is non-blocking */
+  EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+  SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+
+  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
+
+  int ret, err;
+  int timedout_retries = 0;
+  int refused_retries = 0;
+  for (;;) {
+    /* async connect; abort when error happens and abortFlag is present. */
+    ret = connect(fd, &sock->addr.sa, salen);
+    // errno is only meaningful when connect() failed; capture it at once so
+    // neither a stale value nor a later library call can be mistaken for it
+    err = (ret == -1) ? errno : 0;
+    if (ret == 0) break;
+
+    const bool retry = err == EAGAIN ||
+      (err == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+      (err == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES);
+    if (!retry) break;
+
+    if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(err));
+    usleep(SLEEP_INT);
+  }
+
+  if (ret == 0) return ncclSuccess;
+
+  if (err == EINPROGRESS) {
+    // Asynchronous callers poll for completion themselves
+    if (sock->asyncFlag) return ncclSuccess;
+
+    enum ncclSocketState state;
+    do {
+      if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0);
+      NCCLCHECK(getFdState(fd, &state));
+    } while (state == ncclSocketConnecting);
+    EQCHECK(state, ncclSocketError);
+    return ncclSuccess;
+  }
+
+  WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(err));
+  return ncclSystemError;
+}
+
+// Opens a TCP connection to sock->addr (IPv4 or IPv6).
+// On success sock->fd holds the descriptor; with sock->asyncFlag set the
+// connection may still be in progress. On failure the descriptor is closed
+// and sock->fd is left untouched.
+ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
+  char line[SOCKET_NAME_MAXLEN+1];
+  /* IPv4/IPv6 support */
+  int family = sock->addr.sa.sa_family;
+  if (family != AF_INET && family != AF_INET6) {
+    WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
+         ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+    return ncclInternalError;
+  }
+  int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+
+  /* Connect to a hostname / port */
+  int fd = socket(family, SOCK_STREAM, 0);
+  if (fd == -1) {
+    WARN("Net : Socket creation failed : %s", strerror(errno));
+    return ncclSystemError;
+  }
+
+  ncclResult_t res = socketConnectFd(fd, sock, salen);
+  if (res != ncclSuccess) {
+    // Do not leak the descriptor when setup, connect or the wait fails/aborts
+    close(fd);
+    return res;
+  }
+  sock->fd = fd;
+  return ncclSuccess;
+}
+
+// Accepts one connection from listenSocket into sock.
+//   sock         : receives the peer address (sock->addr) and the new
+//                  descriptor (sock->fd); sock->fd is reset to -1 first
+//   listenSocket : listening socket; its abortFlag and asyncFlag drive the loop
+// When listenSocket->asyncFlag is set and no connection is pending
+// (EAGAIN/EWOULDBLOCK), returns ncclSuccess with sock->fd == -1.
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
+  socklen_t socklen = sizeof(union ncclSocketAddress);
+  int tmpFd = sock->fd = -1;
+
+  // Blocking mode (asyncFlag == 0): keep calling accept() while it reports
+  // that nothing is pending. Async mode: a single attempt.
+  do {
+    // NOTE(review): NEQCHECK presumably returns an error once the abort flag
+    // becomes non-zero - confirm against the macro definition
+    if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0);
+    tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen);
+  } while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag);
+
+  if (!listenSocket->asyncFlag) {
+    // Blocking mode: the loop only ends with tmpFd == -1 on a real error
+    EQCHECK(tmpFd, -1);
+  } else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+    // Async mode: "nothing pending" is not an error, anything else is
+    return ncclSystemError;
+  }
+
+  sock->fd = tmpFd;
+  return ncclSuccess;
+}
+
+// Resets *sock to an unconnected state.
+//   addr      : address to copy into the socket; NULL zeroes the address
+//   abortFlag : stored as-is in sock->abortFlag
+//   asyncFlag : stored as-is in sock->asyncFlag
+// A NULL sock is accepted and ignored. Always returns ncclSuccess.
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) {
+  if (sock != NULL) {
+    if (addr == NULL) {
+      memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
+    } else {
+      memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
+    }
+    sock->fd = -1;
+    sock->state = ncclSocketStateNum;
+    sock->asyncFlag = asyncFlag;
+    sock->abortFlag = abortFlag;
+  }
+  return ncclSuccess;
+}
+
+// Moves as many bytes as currently possible between sock and ptr.
+//   op     : NCCL_SOCKET_RECV or NCCL_SOCKET_SEND
+//   ptr    : buffer of `size` bytes
+//   offset : in/out; bytes already transferred, advanced by this call
+//   block  : 0 to use MSG_DONTWAIT, non-zero to pass no flags
+//   closed : output; set to 1 when a receive returns 0 bytes
+// Returns ncclSystemError on a socket error other than EINTR/EWOULDBLOCK/EAGAIN
+// or when the socket's abort flag is raised, ncclSuccess otherwise (including
+// when no progress could be made).
+static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
+  int bytes = 0;
+  *closed = 0;
+  char* data = (char*)ptr;
+  char line[SOCKET_NAME_MAXLEN+1];
+  do {
+    if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_RECV && bytes == 0) {
+      *closed = 1;
+      return ncclSuccess;
+    }
+    if (bytes == -1) {
+      // Capture errno before ncclSocketToString() gets a chance to change it
+      const int err = errno;
+      if (err != EINTR && err != EWOULDBLOCK && err != EAGAIN) {
+        // Name the operation that actually failed
+        WARN("Net : Call to %s %s failed : %s", op == NCCL_SOCKET_RECV ? "recv from" : "send to",
+             ncclSocketToString(&sock->addr, line), strerror(err));
+        return ncclSystemError;
+      }
+      // Transient condition: no bytes moved this round
+      bytes = 0;
+    }
+    (*offset) += bytes;
+    if (sock->abortFlag && *sock->abortFlag != 0) {
+      INFO(NCCL_NET, "Socket progress: abort called");
+      return ncclSystemError;
+    }
+  } while (bytes > 0 && (*offset) < size);
+  return ncclSuccess;
+}
+
+// Non-blocking progress on a send or receive. Unlike ncclSocketProgressOpt,
+// a connection closed by the peer is reported as ncclSystemError.
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+  int peerClosed;
+  NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &peerClosed));
+  if (!peerClosed) return ncclSuccess;
+
+  char addrText[SOCKET_NAME_MAXLEN+1];
+  WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, addrText, 0));
+  return ncclSystemError;
+}
+
+// Repeats ncclSocketProgress until *offset reaches size or an error occurs.
+ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+  for (;;) {
+    if (*offset >= size) break;
+    NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset));
+  }
+  return ncclSuccess;
+}
+
+// Sends exactly `size` bytes from ptr over sock, waiting until all are written.
+ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
+  int sentBytes = 0;  // progress cursor shared with ncclSocketWait
+  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &sentBytes));
+  return ncclSuccess;
+}
+
+// Receives exactly `size` bytes from sock into ptr, waiting until all arrive.
+ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
+  int receivedBytes = 0;  // progress cursor shared with ncclSocketWait
+  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &receivedBytes));
+  return ncclSuccess;
+}
+
+// Receives `size` bytes into ptr, or stops early with *closed set to 1 when a
+// receive returns 0 bytes. A closed connection is not treated as an error here.
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
+  *closed = 0;
+  for (int received = 0; received < size; ) {
+    NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &received, 0, closed));
+    if (*closed) break;
+  }
+  return ncclSuccess;
+}
diff --git a/src/net.cc b/src/net.cc
new file mode 100644
index 0000000000..934a6faef4
--- /dev/null
+++ b/src/net.cc
@@ -0,0 +1,265 @@
+#include "net.h"
+#include "bootstrap.h"
+#include "checks.h"
+
+#include <string.h>
+#include <errno.h>
+#include <dlfcn.h>
+//#include <sys/types.h>
+//#include <sys/stat.h>
+//#include <unistd.h>
+
+// Network and collective-network implementations selected by ncclNetInit().
+ncclNet_t *ncclNet;
+ncclCollNet_t *ncclCollNet;
+
+// Compatibility shims: when a plugin only exports the v4 symbols, the *_v4
+// pointers reference the plugin's tables and the *_v4_as_v5 tables are filled
+// in so that they can be used through the v5 interface.
+static ncclNet_v5_t ncclNet_v4_as_v5;
+static ncclNet_v4_t *ncclNet_v4;
+static ncclCollNet_v5_t ncclCollNet_v4_as_v5;
+static ncclCollNet_v4_t *ncclCollNet_v4;
+
+// v5 getProperties implemented on top of a v4 net plugin: shared fields are
+// copied from the v4 answer, v5-only fields get fixed values.
+static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+  ncclNetProperties_v4_t legacy;
+  const ncclResult_t rc = ncclNet_v4->getProperties(dev, &legacy);
+  if (rc == ncclSuccess) {
+    props->name       = legacy.name;
+    props->pciPath    = legacy.pciPath;
+    props->guid       = legacy.guid;
+    props->ptrSupport = legacy.ptrSupport;
+    props->speed      = legacy.speed;
+    props->port       = legacy.port;
+    props->maxComms   = legacy.maxComms;
+    // Fields that do not exist in v4
+    props->maxRecvs   = 1;
+    props->latency    = 0;
+  }
+  return rc;
+}
+
+// v5 isend on top of v4: the v4 call has no tag parameter, so `tag` is dropped.
+static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+  const ncclResult_t rc = ncclNet_v4->isend(sendComm, data, size, mhandle, request);
+  return rc;
+}
+
+// v5 irecv on top of v4: only a single receive (n == 1) can be forwarded;
+// n == 0 is a no-op and any other count is rejected. Tags are not forwarded.
+static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+  switch (n) {
+    case 0:  return ncclSuccess;
+    case 1:  return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
+    default: return ncclInvalidArgument;
+  }
+}
+
+// v5 iflush on top of v4: only a single buffer (n == 1) can be forwarded;
+// n == 0 is a no-op and any other count is rejected.
+static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+  switch (n) {
+    case 0:  return ncclSuccess;
+    case 1:  return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
+    default: return ncclInvalidArgument;
+  }
+}
+
+// init() entry of the v4-as-v5 shim. The v4 plugin is initialized first and
+// only then is its table copied into the shim, because the plugin's fields may
+// not be populated before its own init() has run.
+static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclNet_v4->init(logfn));
+
+  ncclNet_v5_t* shim = &ncclNet_v4_as_v5;
+  ncclNet_v4_t* plugin = ncclNet_v4;
+
+  // Entries whose signature changed between v4 and v5 go through adapters
+  shim->getProperties = ncclNet_v4_as_v5_getProperties;
+  shim->isend         = ncclNet_v4_as_v5_isend;
+  shim->irecv         = ncclNet_v4_as_v5_irecv;
+  shim->iflush        = ncclNet_v4_as_v5_iflush;
+
+  // Everything else is forwarded unchanged
+  shim->name        = plugin->name;
+  shim->devices     = plugin->devices;
+  shim->listen      = plugin->listen;
+  shim->connect     = plugin->connect;
+  shim->accept      = plugin->accept;
+  shim->regMr       = plugin->regMr;
+  shim->deregMr     = plugin->deregMr;
+  shim->test        = plugin->test;
+  shim->closeSend   = plugin->closeSend;
+  shim->closeRecv   = plugin->closeRecv;
+  shim->closeListen = plugin->closeListen;
+  return ncclSuccess;
+}
+
+// v5 getProperties implemented on top of a v4 collnet plugin: shared fields
+// are copied from the v4 answer, v5-only fields get fixed values.
+static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+  ncclNetProperties_v4_t legacy;
+  const ncclResult_t rc = ncclCollNet_v4->getProperties(dev, &legacy);
+  if (rc == ncclSuccess) {
+    props->name       = legacy.name;
+    props->pciPath    = legacy.pciPath;
+    props->guid       = legacy.guid;
+    props->ptrSupport = legacy.ptrSupport;
+    props->speed      = legacy.speed;
+    props->port       = legacy.port;
+    props->maxComms   = legacy.maxComms;
+    // Fields that do not exist in v4
+    props->maxRecvs   = 1;
+    props->latency    = 0;
+  }
+  return rc;
+}
+
+// init() entry of the collnet v4-as-v5 shim. The v4 plugin is initialized
+// first and only then is its table copied into the shim, because the plugin's
+// fields may not be populated before its own init() has run.
+static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v4->init(logfn));
+
+  ncclCollNet_v5_t* shim = &ncclCollNet_v4_as_v5;
+  ncclCollNet_v4_t* plugin = ncclCollNet_v4;
+
+  // Only getProperties needs an adapter
+  shim->getProperties = ncclCollNet_v4_as_v5_getProperties;
+
+  // Everything else is forwarded unchanged
+  shim->name          = plugin->name;
+  shim->devices       = plugin->devices;
+  shim->listen        = plugin->listen;
+  shim->connect       = plugin->connect;
+  shim->reduceSupport = plugin->reduceSupport;
+  shim->regMr         = plugin->regMr;
+  shim->deregMr       = plugin->deregMr;
+  shim->iallreduce    = plugin->iallreduce;
+  shim->iflush        = plugin->iflush;
+  shim->test          = plugin->test;
+  shim->closeColl     = plugin->closeColl;
+  shim->closeListen   = plugin->closeListen;
+  return ncclSuccess;
+}
+
+// Tries to load an external network plugin and locate its entry tables.
+//   net     : output; set to the plugin's v5 net table, or to the v4-as-v5
+//             shim when only v4 is exported; left untouched if the library
+//             cannot be loaded, nullptr if it exports neither version
+//   collnet : output; same for the optional collective-network table
+// The library name is "librccl-net.so", or "librccl-net-<NCCL_NET_PLUGIN>.so"
+// when that environment variable is set and non-empty.
+static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
+  char ncclNetPluginName[128];
+  const char* envPluginName = getenv("NCCL_NET_PLUGIN");
+  if (envPluginName && strlen(envPluginName)) {
+    snprintf(ncclNetPluginName, 128, "librccl-net-%s.so", envPluginName);
+    INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName);
+  } else {
+    sprintf(ncclNetPluginName, "librccl-net.so");
+  }
+  // Clear errno so that the ENOENT test below reflects this dlopen() call and
+  // not a stale value left behind by unrelated earlier calls
+  errno = 0;
+  void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
+  if (netPluginLib == nullptr) {
+    // dlopen does not guarantee to set errno, but dlerror only gives us a
+    // string, so checking errno doesn't hurt to try to provide a better
+    // error message
+    if (errno == ENOENT) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
+    } else {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+    }
+    return;
+  }
+
+  *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+  if (*net == nullptr) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol.");
+    ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
+    if (ncclNet_v4 == nullptr) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol.");
+      // Nothing usable in this library; it is known to be loaded at this point
+      dlclose(netPluginLib);
+      return;
+    }
+    // Only init is wired here; the shim's init fills in the rest of the table
+    *net = &ncclNet_v4_as_v5;
+    ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init;
+  }
+
+  // Check for CollNet
+  *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+  if (*collnet == nullptr) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.");
+    ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
+    if (ncclCollNet_v4 == nullptr) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.");
+    } else {
+      *collnet = &ncclCollNet_v4_as_v5;
+      ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init;
+    }
+  }
+}
+
+// Initializes the bootstrap network, then selects the main network: the
+// external plugin (if one was loaded), then IB, then sockets. The first
+// candidate that initializes and reports at least one device becomes ncclNet;
+// its collnet counterpart, if any, becomes ncclCollNet under the same test.
+// NCCL_NET, when set, restricts the choice to the network with that name.
+// Returns ncclInvalidUsage when no candidate qualifies.
+ncclResult_t ncclNetInit() {
+  // Always initialize bootstrap network
+  NCCLCHECK(bootstrapNetInit());
+
+  // Initialize main communication network
+  ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
+  ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr };
+  initPlugin(&nets[0], &collNets[0]);
+  char* netName = getenv("NCCL_NET");
+  bool ok = false;
+
+  for (int i=0; i<3; i++) {
+    if (nets[i] == nullptr) continue;
+    // The v4 compatibility shim only receives its name inside init(), so a
+    // missing name cannot be compared yet and must not be passed to strcmp
+    if (netName && nets[i]->name && strcmp(netName, nets[i]->name) != 0) continue;
+
+    int ndev;
+    if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue;
+    // Re-check the requested name now that init() had a chance to fill it in
+    if (netName && (nets[i]->name == nullptr || strcmp(netName, nets[i]->name) != 0)) continue;
+    if (nets[i]->devices(&ndev) != ncclSuccess) continue;
+    if (ndev <= 0) continue;
+    ncclNet = nets[i];
+    ok = true;
+
+    // The collnet is optional: any failure simply leaves ncclCollNet unset
+    if (collNets[i]) {
+      do {
+        if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break;
+        if (collNets[i]->devices(&ndev) != ncclSuccess) break;
+        if (ndev <= 0) break;
+        ncclCollNet = collNets[i];
+      } while(0);
+    }
+    break;
+  }
+
+  if (!ok) {
+    WARN("Error: network %s not found.", netName ? netName : "");
+    return ncclInvalidUsage;
+  }
+  return ncclSuccess;
+}
+
+// Reports through *gdrSupport (1 or 0) whether GPU Direct RDMA is usable.
+// Three strategies appear below:
+//  - CUDART_VERSION >= 11030 with a matching driver: query the device attribute;
+//  - HIP builds: report support as soon as one net device advertises
+//    NCCL_PTR_CUDA, without probing;
+//  - otherwise: open a loopback connection on the first NCCL_PTR_CUDA-capable
+//    device and try to register a GPU buffer on both ends.
+// Failures of the probe's listen/connect/accept/alloc steps jump to the
+// cleanup labels and still return ncclSuccess with *gdrSupport left at 0.
+ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
+  constexpr int GPU_BUF_SIZE = 2*1024*1024;
+#if CUDART_VERSION >= 11030
+  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
+  int driverVersion;
+  CUDACHECK(cudaDriverGetVersion(&driverVersion));
+  if (driverVersion >= 11030) {
+    int cudaDev, attr = 0;
+    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
+    *gdrSupport = attr;
+    return ncclSuccess;
+  }
+#endif
+  int netDevs;
+  NCCLCHECK(ncclNetDevices(&netDevs));
+  *gdrSupport = 0;
+  for (int dev=0; dev<netDevs; dev++) {
+    // Find a net device which is GDR-capable
+    ncclNetProperties_t props;
+    NCCLCHECK(ncclNetGetProperties(dev, &props));
+    if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    // HIP builds trust the advertised capability; the probe below is then
+    // compiled but never reached
+    *gdrSupport = 1;
+    break;
+#endif
+
+    // Allocate memory on the GPU and try to register it on the NIC.
+    void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+    ncclNetHandle_t handle;
+    void* gpuPtr = NULL;
+    void* mHandle = NULL;
+    ncclResult_t ret;
+    // NOTE(review): presumably silences NET warnings while probing - confirm.
+    // It is only reset on the path that reaches the hipFree below.
+    ncclDebugNoWarn = NCCL_NET;
+    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
+    // Connect and accept are retried until they hand back a communicator
+    while (sComm == NULL) {
+      NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
+    }
+    while (rComm == NULL) {
+      NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
+    }
+    CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
+    // Registration on the send side decides the answer; the receive side must
+    // then succeed as well
+    if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+      NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
+      NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+      NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
+      *gdrSupport = 1;
+    }
+    ncclDebugNoWarn = 0;
+    CUDACHECK(hipFree(gpuPtr));
+    // Cleanup ladder: each label releases what was acquired before the
+    // corresponding failure point, in reverse order of acquisition
+cleanup4:
+    NCCLCHECK(ncclNetCloseRecv(rComm));
+cleanup3:
+    NCCLCHECK(ncclNetCloseSend(sComm));
+cleanup2:
+    NCCLCHECK(ncclNetCloseListen(lComm));
+cleanup1:
+    // Only the first capable device is ever probed
+    break;
+  }
+  return ncclSuccess;
+}
+
+// Returns the plugin API version behind ncclNet: 4 when the v4-as-v5 shim is
+// the selected network, 5 otherwise.
+int ncclNetVersion() {
+  if (ncclNet == &ncclNet_v4_as_v5) return 4;
+  return 5;
+}
diff --git a/src/proxy.cc b/src/proxy.cc
index a6d142beb3..43065cd2d0 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,6 +8,11 @@
 #include "comm.h"
 #include "info.h"
 #include "collectives.h"
+#include "socket.h"
+#include "shm.h"
+#include "profiler.h"
+#define ENABLE_TIMER 0
+#include "timer.h"
 
 enum { proxyRecv=0, proxySend=1 };
 
@@ -15,7 +20,7 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
   if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
 
   /* In chains, one rank does not need a proxy. Let's figure out which one it is */
-  // Which index in the reorganized rings should we compare root against */
+  /* Which index in the reorganized rings should we compare root against */
   const int myrank = 0, nextrank = 1, prevrank = nranks-1;
   int index = pattern == ncclPatternPipelineFrom ?
       /*                            no recv /  no send    if root = */
@@ -25,47 +30,30 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
   return (root != rank);
 }
 
-#define PROXYARGS_ALLOCATE_SIZE 128
+#define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS
 struct ncclProxyPool {
   struct ncclProxyPool *next;
   struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
 };
 
-static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
-  struct ncclProxyState* state = &comm->proxyState;
+static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) {
   struct ncclProxyArgs* elem;
   if (state->pool == NULL) {
-    // Check whether there are freed elements
-    if (state->poolReturned) {
-      pthread_mutex_lock(&state->poolMutex);
-      state->pool = state->poolReturned;
-      state->poolReturned = NULL;
-      pthread_mutex_unlock(&state->poolMutex);
-    } else {
-      // Allocate a new pool of elements. Make sure we allocate the memory close
-      // to the network thread
-      struct ncclProxyPool* newPool;
-      cpu_set_t affinitySave;
-      if (CPU_COUNT(&comm->cpuAffinity)) {
-        sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-        sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
-      }
-      NCCLCHECK(ncclCalloc(&newPool, 1));
-      if (CPU_COUNT(&comm->cpuAffinity)) {
-        sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-      }
+    // Allocate a new pool of elements. Make sure we allocate the memory close
+    // to the network thread
+    struct ncclProxyPool* newPool;
+    NCCLCHECK(ncclCalloc(&newPool, 1));
 
-      struct ncclProxyArgs* newElems = newPool->elems;
-      // Chain newly allocated elements
-      for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
-        if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
-      }
-      // Add them all to the pool list
-      state->pool = newElems;
-      // Save the pool memory block for later resource release
-      newPool->next = state->pools;
-      state->pools = newPool;
+    struct ncclProxyArgs* newElems = newPool->elems;
+    // Chain newly allocated elements
+    for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+      if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
     }
+    // Add them all to the pool list
+    state->pool = newElems;
+    // Save the pool memory block for later resource release
+    newPool->next = state->pools;
+    state->pools = newPool;
   }
   elem = state->pool;
   state->pool = state->pool->next;
@@ -83,241 +71,394 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a
 
 #define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1)
 #define OP_SEEN 0x100000
-ncclResult_t dumpProxyState(struct ncclProxyState* state) {
-#ifdef DEBUG_PROXY
-  struct ncclProxyArgs* op = state->ops;
-  while (op) {
-    if (op->idle & OP_SEEN) {
-      WARN("Active list loop at element %ld", OP_INDEX(op));
+
+ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex) { // Locate op among state->pools: on success *poolIndex is the pool's position in that list and *opIndex is op's slot inside the pool
+  struct ncclProxyPool* pool = state->pools;
+  int p = 0; // Position of `pool` in the state->pools list
+  while (pool) {
+    uint64_t o = op-pool->elems; // Element offset from this pool's first elem; held unsigned, so an op located before the pool becomes a huge value and fails the range test below
+    if (o < PROXYARGS_ALLOCATE_SIZE) {
+      *opIndex = o;
+      *poolIndex = p;
+      return ncclSuccess;
     }
-    op->idle |= OP_SEEN;
-    printf("[%ld(%ld/%d)]", OP_INDEX(op), op->opCount, op->nsubs);
-    if (op->nextPeer) {
-      printf("(%ld)", OP_INDEX(op->nextPeer));
-      struct ncclProxyArgs* n = op->nextPeer;
-      n->idle |= OP_SEEN;
-      while (n->nextPeer) {
-        n = n->nextPeer;
-        n->idle |= OP_SEEN;
+    pool = pool->next;
+    p++;
+  }
+  WARN("Could not find pool of op %p\n", op); // Every pool was checked and none contains op
+  return ncclInternalError;
+}
+
+ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) { // Print a one-line bracketed summary of op to stdout: pool-op index, opCount, pattern, then one entry per sub; always returns ncclSuccess
+  printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? "Recv" : "Coll");
+  for (int s=0; s<op->nsubs; s++) {
+    struct ncclProxySubArgs* sub = op->subs+s;
+    if (op->state == ncclProxyOpProgress) { // Only ops in the progress state get a status letter derived from the sub's step counters
+      char status = ' '; // Stays blank when the pattern is neither recv nor send
+      if (op->pattern == ncclPatternRecv) {
+        if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
+        else if (sub->received < sub->posted) status = 'R'; // Receiving
+        else if (sub->received < sub->transmitted) status = 'R'; // Receiving
+        else if (sub->transmitted < sub->received) status = 'F'; // Flushing
+        else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU
+        else status = 'D'; // Done
+      } else if (op->pattern == ncclPatternSend) {
+        if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
+        else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU
+        else if (sub->done < sub->transmitted) status = 'S'; // Sending
+        else status = 'D'; // Done
       }
+      printf(" %d%c/%d", sub->peer, status, sub->channelId); // Format: peer, status letter, '/', channel id
+    } else {
+      printf(" %d/%d", sub->peer, sub->channelId); // Format: peer '/' channel id, no status
     }
+  }
+  printf("]");
+  return ncclSuccess;
+}
+ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) { // Print the active list (each op followed by its nextPeer chain) to stdout, then report every pool element that is in neither the active nor the free list
+  struct ncclProxyArgs* op = state->active;
+  int poolIndex, opIndex;
+  printf("ACTIVE OPS\n");
+  while (op) {
+    NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+    if (op->state & OP_SEEN) { // Already marked during this dump, i.e. reached twice
+      WARN("List loop at element %d-%d", poolIndex, opIndex); // NOTE(review): the loop is only reported; the walk itself is not interrupted here
+    }
+    NCCLCHECK(printProxyOp(op, poolIndex, opIndex));
+    op->state |= OP_SEEN; // Temporary visited mark; removed again by the pool sweep at the end of this function
+    printf("\n");
+    struct ncclProxyArgs* nextOp = op->nextPeer;
+    while (nextOp) { // Ops chained behind `op` through nextPeer
+      NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
+      if (nextOp->state & OP_SEEN) {
+        WARN("List loop at element %d-%d", poolIndex, opIndex);
+      }
+      printf("| `-> ");
+      NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex));
+      nextOp->state |= OP_SEEN;
+      printf("\n");
+      if (nextOp->next) { // Ops reached through nextPeer are expected to have no `next` link
+        WARN("Inactive op has next set!\n");
+      }
+      nextOp = nextOp->nextPeer;
+    }
+    if (op->nextPeer == NULL) printf("|\n");
+    op = op->next;
+    printf("v\n");
+  }
+  printf("[X]\n");
+
+# if 0
+  printf("FREE OPS\n");
+  op = state->pool;
+  while (op) {
+    NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+    if (op->state & OP_SEEN) {
+      WARN("List loop at element %d-%d", poolIndex, opIndex);
+    }
+    NCCLCHECK(printProxyOp(op, poolIndex, opIndex));
+    op->state |= OP_SEEN;
     printf("->");
     op = op->next;
   }
   printf("[X]\n");
-
-  struct ncclProxyArgs* free = state->pool;
-  while (free) {
-    if (free->idle & OP_SEEN) {
-      WARN("Free list loop at element %ld", OP_INDEX(free));
+#else
+  op = state->pool; // Walk the free list without printing it, only marking its entries as seen
+  while (op) {
+    NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+    if (op->state & OP_SEEN) {
+      WARN("List loop at element %d-%d", poolIndex, opIndex);
     }
-    free->idle |= OP_SEEN;
-    free = free->next;
-  }
-
-  struct ncclProxyPool* p = state->pools;
-  int i = 0;
-  while (p) {
-    for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
-      if ((p->elems[e].idle & OP_SEEN) == 0) {
-        WARN("Element %d of pool %d has been lost", e, i);
-        struct ncclProxyArgs* free = state->pool;
-        printf("Free list ");
-        while (free) {
-          printf("--> %ld ", OP_INDEX(free));
-          free = free->next;
-        }
-        printf("\n");
-        return ncclInternalError;
-      }
-      p->elems[e].idle -= OP_SEEN;
-    }
-    p = p->next;
-    i++;
+    op->state |= OP_SEEN;
+    op = op->next;
   }
 #endif
+
+  struct ncclProxyPool* pool = state->pools; // Final sweep over every element of every pool
+  poolIndex = 0;
+  while (pool) {
+    struct ncclProxyArgs* elem = pool->elems;
+    for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++, elem++) {
+      if ((elem->state & OP_SEEN) == 0) { // Not reached from the active list nor from the free list
+        printf("Elem %d-%d is not in any list:\n", poolIndex, e);
+        NCCLCHECK(printProxyOp(elem, poolIndex, e));
+        printf("\n");
+      } else {
+        elem->state -= OP_SEEN; // Remove the temporary visited mark
+      }
+    }
+    pool = pool->next;
+    poolIndex++;
+  }
   return ncclSuccess;
 }
 
-static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args) {
-  struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr;
-  int shared = args->subs[0].connector->conn.shared;
-  if (proxyAppend) {
-    if (shared && proxyAppend->opCount == args->opCount) {
-      if ((proxyAppend->sliceSteps != args->sliceSteps) ||
-          (proxyAppend->chunkSteps != args->chunkSteps) ||
-          (proxyAppend->protocol != args->protocol) ||
-          (proxyAppend->dtype != args->dtype) ||
-          (proxyAppend->redOp != args->redOp)) {
-        WARN("Proxy append mismatch");
-        return ncclInternalError;
-      }
-      if (proxyAppend->nsubs >= NCCL_PROXY_MAX_SUBS) {
-        WARN("Proxy append out of bound");
-        return ncclInternalError;
-      }
-      memcpy(proxyAppend->subs+proxyAppend->nsubs, args->subs, sizeof(struct ncclProxySubArgs));
-      proxyAppend->nsubs++;
-      args->next = proxyAppend->next;
-      // Free args as we merged them
-      args->next = state->poolFreed;
-      state->poolFreed = args;
-      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld/%5ld) as group with %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyArgs* args, int subIndex) { // Copy op into sub slot subIndex of args; for subIndex 0 also initialize the args-level fields from op. Returns ncclInternalError on a full sub array, mismatching parameters or a non-ready args
+  struct ncclProxySubArgs* sub = args->subs+subIndex;
+  if (subIndex >= NCCL_PROXY_MAX_SUBS) { // Checked before anything is written through `sub`
+    WARN("Proxy append out of bounds");
+    return ncclInternalError;
+  }
+
+  //memset(sub, 0, sizeof(struct ncclProxySubArgs));
+  sub->connection = op->connection;
+  sub->channelId = op->channelId;
+  sub->nsteps = op->nsteps;
+  sub->nbytes = op->nbytes;
+  sub->peer = op->root; // The sub's peer is taken from the op's root field
+  args->nsubs = subIndex+1;
+  if (subIndex) { // Appending to args that already holds sub 0: args-level fields are kept, op only has to be compatible with them
+    if ((args->sliceSteps != op->sliceSteps) ||
+        (args->chunkSteps != op->chunkSteps) ||
+        (args->protocol != op->protocol) ||
+        (args->dtype != op->dtype) ||
+        (args->redOp != op->redOp)) {
+      WARN("Proxy append mismatch");
+      return ncclInternalError;
+    }
+    if (args->state != ncclProxyOpReady) { // Subs may only be added while args is still in the ready state
+      WARN("Proxy append on running operation");
+      return ncclInternalError;
+    }
+    return ncclSuccess;
+  }
+  //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress));
+  args->done = 0; // From here on: first sub, so set up the args-level fields
+  args->opCount = op->opCount;
+  args->sliceSteps = op->sliceSteps;
+  args->chunkSteps = op->chunkSteps;
+  args->chunkSize = op->chunkSize;
+  args->dtype = op->dtype;
+  args->redOp = op->redOp;
+  args->pattern = op->pattern;
+  args->protocol = op->protocol;
+  args->state = ncclProxyOpReady;
+  args->progress = op->connection->tcomm->proxyProgress; // Progress callback comes from the connection's tcomm
+  args->proxyAppendPtr = op->connection->proxyAppendPtr;
+  return ncclSuccess;
+}
+
+static ncclResult_t ProxyAppend(struct ncclProxyProgressState* state, struct ncclProxyOp* op) { // Turn op into proxy args: merge it as an extra sub, chain it as nextPeer of the connection's current args, or add new args at the end of state->active
+  struct ncclProxyConnection* connection = op->connection;
+  int shared = connection->shared;
+  struct ncclProxyArgs* args = *connection->proxyAppendPtr; // Args currently recorded as the append point for this connection, NULL if there is none
+
+  if (args) {
+    if (shared && args->opCount == op->opCount) { // Shared connection and same opCount: op becomes sub number args->nsubs of the existing args
+      NCCLCHECK(ncclProxyOpToArgs(op, args, args->nsubs));
+      DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args));
     } else {
-      proxyAppend->nextPeer = args;
-      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+      struct ncclProxyArgs* prevArgs = args;
+      NCCLCHECK(allocateArgs(state, &args)); // `args` now points at a freshly allocated element
+      NCCLCHECK(ncclProxyOpToArgs(op, args, 0));
+      prevArgs->nextPeer = args; // Chain the new args behind the previous append point
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs));
       *(args->proxyAppendPtr) = args;
     }
   } else {
     // Nothing running for that peer. Add to the list
-    if (state->ops == NULL) {
+    NCCLCHECK(allocateArgs(state, &args));
+    NCCLCHECK(ncclProxyOpToArgs(op, args, 0));
+    if (state->active == NULL) {
       // Create the list
       DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount);
-      state->ops = args;
+      state->active = args;
     } else {
       // Append element at the end of the list
-      struct ncclProxyArgs* last = state->ops;
+      struct ncclProxyArgs* last = state->active;
       while (last->next) last = last->next;
       last->next = args;
-      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld) as last element\n", OP_INDEX(args),shared, args->opCount);
+      DEBUG_PROXY_PRINT("Insert  %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount);
     }
     *(args->proxyAppendPtr) = args;
   }
   return ncclSuccess;
 }
 
-static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args, int connIndex) {
-  if (peer < 0) return ncclSuccess;
-
-  struct ncclChannel* channel = args->subs[0].channel;
-  struct ncclPeer* peerComm = channel->peers+peer;
-  struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
-  if (connector->transportComm == NULL) {
-    WARN("Rank %d has no transport for %s peer %d on channel %d", connector->comm->rank,
-        type == proxyRecv ? "recv" : "send", peer, channel->id);
-    return ncclInternalError;
+ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextOpsEnd) { // Under pool->mutex, hand the op chain nextOps..nextOpsEnd (indices into pool->ops) to the pool's posted list
+  pthread_mutex_lock(&pool->mutex);
+  if (pool->nextOps == -1) { // Posted list is empty: the chain becomes the list
+    pool->nextOps = nextOps;
+    pthread_cond_signal(&pool->cond); // Wake a waiter on pool->cond; signalled only on the empty -> non-empty transition
+  } else {
+    pool->ops[pool->nextOpsEnd].next = nextOps; // Otherwise link the chain after the current tail
   }
-  if (connector->transportComm->proxy == NULL) return ncclSuccess;
-
-  struct ncclProxyState* state = &connector->comm->proxyState;
-  struct ncclProxyArgs* op;
-  NCCLCHECK(allocateArgs(connector->comm, &op));
-  memcpy(op, args, sizeof(struct ncclProxyArgs));
-  op->subs[0].connector = connector;
-  op->progress = connector->transportComm->proxy;
-  op->state = ncclProxyOpReady;
-  op->proxyAppendPtr = connector->proxyAppendPtr;
-
-  if (state->nextOps == NULL) state->nextOps = op;
-  else state->nextOpsEnd->next = op;
-  state->nextOpsEnd = op;
+  pool->nextOpsEnd = nextOpsEnd; // In both cases the chain's last op is the new tail
+  pthread_mutex_unlock(&pool->mutex);
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks) {
-  struct ncclChannel* channel = args->subs[0].channel;
-  int pattern = args->pattern;
+ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) { // Copy proxyOp into a free slot of the ops pool and queue it on the pending chain for proxyConn->localRank; posts part of that chain once MAX_OPS_PER_PEER ops are pending
+  struct ncclProxyOps* proxyOps = proxyConn->comm->proxyState.proxyOps;
+  if (proxyOps == NULL) return ncclInternalError;
+  proxyOps += proxyConn->localRank; // Select the entry for the connector's local rank
+  struct ncclProxyOpsPool* pool = proxyOps->pool;
+
+  TIME_START(0);
+  int opIndex = proxyOps->freeOp; // Head of the free list cached in proxyOps, -1 when empty
+  struct ncclProxyOp* op;
+  if (opIndex != -1) {
+    op = pool->ops+opIndex;
+    proxyOps->freeOp = op->next;
+  } else {
+    int freeOp;
+    while ((freeOp = pool->freeOps[comm->localRank]) == -1) sched_yield(); // Cached list exhausted: yield until the pool's free list for comm->localRank is non-empty
+    int freeOpNew;
+    while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+comm->localRank, freeOp, -1)) != freeOp) freeOp = freeOpNew; // Take the whole list by atomically replacing its head with -1; retry with the observed head if it changed in between
+    opIndex = freeOp;
+    op = pool->ops+opIndex;
+    proxyOps->freeOp = op->next; // The rest of the taken list becomes the cached free list
+  }
+  if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op
+  memcpy(op, proxyOp, sizeof(struct ncclProxyOp));
+  op->next = -1; // The copy overwrote `next`; the new op is the end of the pending chain
+  op->connection = proxyConn->connection;
+  if (proxyOps->nextOps == -1) { // Pending chain empty: op is both head and tail
+    proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex;
+  } else {
+    pool->ops[proxyOps->nextOpsEnd].next = opIndex;
+    proxyOps->nextOpsEnd = opIndex;
+  }
+  if (++proxyOps->count == MAX_OPS_PER_PEER) {
+    // Post what we have so far to free some ops in the pool
+    // Do not post last operations as we could have more coming with the same opCount, and posting
+    // them in different batches would break proxyArgs aggregation with subs.
+    uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount;
+    int lastOp = -1; // Last op in the chain whose opCount differs from the tail's
+    int toSend = 0; // Number of ops from the head up to and including lastOp
+    int ops = 0;
+    for (int op= proxyOps->nextOps; op != proxyOps->nextOpsEnd; op=pool->ops[op].next) { // Here `op` is an index and shadows the pointer declared above; the tail itself is not visited
+      ops++;
+      if (pool->ops[op].opCount != lastOpCount) {
+        lastOp = op;
+        toSend = ops;
+      }
+    }
+    if (lastOp == -1) { // Every pending op has the tail's opCount, so there is no safe place to cut
+      WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
+      return ncclInternalError;
+    }
+    // Cut chain at lastOp
+    int nextOps = proxyOps->nextOps;
+    proxyOps->nextOps = pool->ops[lastOp].next; // The ops after lastOp stay pending
+    pool->ops[lastOp].next = -1;
+    NCCLCHECK(ncclProxyPost(proxyOps->pool, nextOps, lastOp));
+    proxyOps->count -= toSend;
+  }
+  TIME_STOP(0);
+  return ncclSuccess;
+}
+
+static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { // Queue op for the send or recv connector (chosen by type and connIndex) of `peer` on `channel`, when that connector has a proxy progress function
+  if (peer < 0) return ncclSuccess; // Negative peer: nothing to queue
+
+  struct ncclPeer* peerComm = channel->peers+peer;
+  struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
+  if (connector->transportComm == NULL) {
+    WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
+        type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
+    return ncclInternalError;
+  }
+  if (connector->transportComm->proxyProgress == NULL) return ncclSuccess; // Transport without a proxy progress function: no op is queued
+
+  NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) { // Queue op (via SaveProxy) for each peer that collective pattern op->pattern uses on channel op->channelId
+  struct ncclChannel* channel = comm->channels+op->channelId;
+  int pattern = op->pattern;
   if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
     struct ncclRing* ring = &channel->ring;
-    if (NeedProxy(proxyRecv, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args, args->connIndex));
-    if (NeedProxy(proxySend, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args, args->connIndex));
+    if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex)); // Ring: receive from ring->prev
+    if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex)); // Ring: send to ring->next
   }
   if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
     // Tree up
     struct ncclTree* tree = &channel->tree;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args, 0));
-    NCCLCHECK(SaveProxy(proxySend, tree->up, args, 0));
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0)); // Receive from every tree->down entry (SaveProxy returns early for negative peers)
+    NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0));
   }
   if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
     // Tree down
     struct ncclTree* tree = &channel->tree;
-    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args, 0));
-    NCCLCHECK(SaveProxy(proxyRecv, tree->up, args, 0));
+    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0)); // Send to every tree->down entry
+    NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0));
   }
   if (pattern == ncclPatternCollTreeUpDown) {
     // CollTree up
-    NCCLCHECK(SaveProxy(proxySend, channel->collTree.out, args, 1));  // For CollTree up, we are using push
+    NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1));  // For CollTree up, we are using push
     // CollTree down
-    NCCLCHECK(SaveProxy(proxyRecv, channel->collTree.out, args, 0));
+    NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0));
   }
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args) {
-  memset(args, 0, sizeof(struct ncclProxyArgs));
-  int channelId = info->channelId;
-  args->nsubs = 1;
-  struct ncclProxySubArgs* sub = args->subs;
+NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
 
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) { // Fill *op for the point-to-point send or recv described by info and choose info->chunkSize; returns ncclInternalError when info->coll is neither
+  memset(op, 0, sizeof(struct ncclProxyOp)); // Start from an all-zero op
+  int channelId = info->channelId;
   struct ncclChannel* channel = info->comm->channels+channelId;
-  sub->channel = channel;
-  args->sliceSteps = 1;
-  args->chunkSteps = 1;
-  args->protocol = NCCL_PROTO_SIMPLE;
-  args->dtype = info->datatype;
-  sub->delta = info->delta;
-  sub->recvbytes = info->recvbytes;
-  sub->sendbytes = info->sendbytes;
+  op->channelId = channelId;
+  op->sliceSteps = 1;
+  op->chunkSteps = 1;
+  op->protocol = NCCL_PROTO_SIMPLE;
+  op->dtype = info->datatype;
+  op->connIndex = info->connIndex;
 
   int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
-  info->recvChunkSize = stepSize;
-  info->sendChunkSize = stepSize;
+  info->chunkSize = stepSize; // Default chunk size is one step
+  op->root = info->root; // For p2p, root is used as the index of the peer (see `peer` below)
+  op->nbytes = info->count;
+  struct ncclPeer* peer = channel->peers + op->root;
 
-  if (info->delta > 0 && info->recvbytes >= 0) {
-    int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
-    if (channel->peers[peerrecv].recv[0].transportComm && channel->peers[peerrecv].recv[0].transportComm->proxy) {
+  if (info->coll == ncclFuncSend) {
+    op->pattern = ncclPatternSend;
+    if (op->root != info->comm->rank && peer->send[info->connIndex].transportComm && peer->send[info->connIndex].transportComm->proxyProgress) { // Only when the peer is another rank and its send transport has a proxy progress function
       // Tune chunk size for the network
-      if (info->recvbytes < stepSize) info->recvChunkSize /= 4;
-      else if (info->recvbytes < 8*stepSize) info->recvChunkSize /= 2;
+      if (info->count < stepSize) info->chunkSize /= 4;
+      else if (info->count < 8*stepSize) info->chunkSize /= 2;
     }
-    sub->recvChunkSize = info->recvChunkSize;
+  } else if (info->coll == ncclFuncRecv) {
+    op->pattern = ncclPatternRecv;
+    if (op->root != info->comm->rank && peer->recv[info->connIndex].transportComm && peer->recv[info->connIndex].transportComm->proxyProgress) { // Same condition as the send case, on the recv connector
+      // Tune chunk size for the network
+      if (info->count < stepSize) info->chunkSize /= 4;
+      else if (info->count < 8*stepSize) info->chunkSize /= 2;
+    }
+  } else {
+    WARN("P2p operation is neither send or recv");
+    return ncclInternalError;
   }
-  if (info->delta > 0 && info->sendbytes >= 0) {
-    int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
-    if (channel->peers[peersend].send[0].transportComm && channel->peers[peersend].send[0].transportComm->proxy) {
-      // Tune chunk size for the network
-      if (info->sendbytes < stepSize) info->sendChunkSize /= 4;
-      else if (info->sendbytes < 8*stepSize) info->sendChunkSize /= 2;
-    }
-    sub->sendChunkSize = info->sendChunkSize;
+  if (ncclParamChunkSize() != 0) { // A non-zero ChunkSize parameter replaces the computed chunk size
+    info->chunkSize = ncclParamChunkSize();
+  }
+  op->chunkSize = info->chunkSize;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) { // Set op->opCount and op->nsteps for a point-to-point op and queue it (via SaveProxy) for peer op->root
+  struct ncclChannel* channel = comm->channels+op->channelId;
+  op->opCount = channel->workFifoTail-1;
+  if (op->root == comm->rank) return ncclSuccess; // Peer is this rank itself: nothing is queued
+  if (op->pattern == ncclPatternRecv) {
+    op->nsteps = DIVUP(op->nbytes, op->chunkSize);
+    if (op->nsteps == 0) op->nsteps = 1; // nsteps is never left at 0 (e.g. when nbytes is 0)
+    NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, op->connIndex));
+  } else if (op->pattern == ncclPatternSend) {
+    op->nsteps = DIVUP(op->nbytes, op->chunkSize);
+    if (op->nsteps == 0) op->nsteps = 1;
+    NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, op->connIndex));
   }
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args) {
-  struct ncclProxySubArgs* sub = args->subs;
-  struct ncclChannel* channel = sub->channel;
-  args->opCount = channel->workFifoTail-1;
-  args->commOpCount = comm->opCount;
-  const ssize_t recvbytesOrig = sub->recvbytes;
-  const ssize_t sendbytesOrig = sub->sendbytes;
-  if (sub->delta > 0 && recvbytesOrig >= ssize_t(0)) {
-    int peerrecv = (comm->nRanks+comm->rank-sub->delta)%comm->nRanks;
-    sub->recvbytes = recvbytesOrig;
-    sub->sendbytes = 0;
-    sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize);
-    if (sub->nsteps == 0) sub->nsteps = 1;
-    NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, args->recvIdx));
-  }
-  if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) {
-    int peersend = (comm->rank+sub->delta)%comm->nRanks;
-    sub->sendbytes = sendbytesOrig;
-    sub->recvbytes = 0;
-    sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize);
-    if (sub->nsteps == 0) sub->nsteps = 1;
-    NCCLCHECK(SaveProxy(proxySend, peersend, args, args->sendIdx));
-  }
-  // Reset proxy args for potentially multiple cuda graph launches
-  // It is safe as long as SaveProxy copies contents of args to op
-  sub->recvbytes = recvbytesOrig;
-  sub->sendbytes = sendbytesOrig;
-  return ncclSuccess;
-}
-
-static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
+static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
   struct ncclProxyArgs* freeOp = *opPtr;
-  DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next));
   struct ncclProxyArgs* next = freeOp->next;
+  DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next));
   *opPtr = next;
   if (freeOp->nextPeer) {
     // replace op by nextPeer
@@ -325,7 +466,7 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs*
     if (*prevOpPtr) {
       (*prevOpPtr)->next = nextPeer;
     } else {
-      state->ops = nextPeer;
+      state->active = nextPeer;
     }
     nextPeer->next = next;
     *(prevOpPtr) = nextPeer;
@@ -334,25 +475,31 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs*
     if (*prevOpPtr) {
       (*prevOpPtr)->next = next;
     } else {
-      state->ops = next;
+      state->active = next;
     }
   }
-  freeOp->next = state->poolFreed;
-  state->poolFreed = freeOp;
-  DEBUG_PROXY_PRINT("Removed %5ld (%5ld)                                               : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+  freeOp->next = state->pool;
+  state->pool = freeOp;
+  DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+#ifdef DEBUG_PROXY
   NCCLCHECK(dumpProxyState(state));
+#endif
   return ncclSuccess;
 }
 
-static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
+static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) {
   struct ncclProxyArgs* prevOp = NULL;
-  struct ncclProxyArgs* op = *opsPtr;
+  struct ncclProxyArgs* op = opStart;
   while (op) {
     if (op->state == ncclProxyOpNone) return ncclInternalError;
-    NCCLCHECK(op->progress(op));
+    TIME_START(0); TIME_START(1);
+    NCCLCHECK(op->progress(comm, op));
+    if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
     *idle &= op->idle;
     if (op->state == ncclProxyOpNone) {
+      TIME_START(2);
       NCCLCHECK(removeOp(state, &op, &prevOp));
+      TIME_STOP(2);
     } else {
       prevOp = op;
       op = op->next;
@@ -361,197 +508,606 @@ static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyAr
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxyAppendPosted(struct ncclProxyState* state) {
-  // Return any freed element first
-  if (state->poolFreed) {
-    struct ncclProxyArgs* end = state->poolFreed;
-    while (end->next) end = end->next;
-    pthread_mutex_lock(&state->poolMutex);
-    end->next = state->poolReturned;
-    state->poolReturned = state->poolFreed;
-    pthread_mutex_unlock(&state->poolMutex);
-    state->poolFreed = NULL;
+static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) { // Take the ops posted to state->opsPool, convert each with ProxyAppend (incrementing *added per op), then return the consumed op slots to the pool's per-rank free lists
+  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  if (state->opsPool == NULL) return ncclInternalError;
+  struct ncclProxyOpsPool* pool = state->opsPool;
+
+  struct ncclProxyArgs profArgs; // Only used for profiling purposes
+  if (state->nextOps != -1) goto process_nextops; // A chain taken earlier is still pending in state->nextOps: process it without touching the pool
+
+  // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
+  // to be available. Exit, continue progress, and come back later.
+  if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess;
+
+  if (state->active == NULL) {
+    pthread_mutex_lock(&pool->mutex);
+    while (pool->nextOps == -1 && !state->stop) { // Nothing active: sleep on pool->cond until ops are posted or stop is set
+      struct ncclProxyArgs profArgs; // Only used for profiling purposes
+      ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
+      pthread_cond_wait(&pool->cond, &pool->mutex);
+      ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
+    }
+    if (state->stop) { // We might have been woken up to stop.
+      pthread_mutex_unlock(&pool->mutex);
+      return ncclSuccess;
+    }
   }
 
-  // Then wait until we have new work to do
-  pthread_mutex_lock(&state->opsMutex);
-  while (state->postedOps == NULL) {
-    if (state->stop) return ncclSuccess;
-    pthread_cond_wait(&state->cond, &state->opsMutex);
+  state->nextOps = pool->nextOps; // pool->mutex is held here (taken by the trylock or the lock above): take the whole posted chain
+  pool->nextOps = pool->nextOpsEnd = -1;
+  pthread_mutex_unlock(&pool->mutex);
+  if (state->nextOps == -1) return ncclInternalError;
+
+process_nextops:
+  ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend);
+  TIME_START(2);
+  int freeOp[NCCL_MAX_LOCAL_RANKS]; // Per local rank: head of the chain of slots consumed in this call, -1 if none
+  int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; // Per local rank: tail of that chain; only meaningful where freeOp[i] != -1
+  for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;
+
+  for (int opIndex = state->nextOps; opIndex != -1;) {
+    struct ncclProxyOp* peerOp = pool->ops+opIndex;
+    int peer = opIndex / MAX_OPS_PER_PEER; // Presumably the pool holds MAX_OPS_PER_PEER consecutive slots per local rank - TODO confirm against the pool setup
+    if (peerOp->connection == NULL) return ncclInternalError;
+    if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next);
+    NCCLCHECK(ProxyAppend(state, peerOp));
+    (*added)++;
+    int lastOpIndex = opIndex;
+    opIndex = peerOp->next; // Read the successor before peerOp->next is reused for the free chain
+    // Return op to peer pool
+    if (freeOp[peer] == -1) {
+      freeOpEnd[peer] = lastOpIndex; // First slot for this peer: it is the tail; its `next` is set when the chain is spliced below
+    } else {
+      peerOp->next = freeOp[peer];
+    }
+    freeOp[peer] = lastOpIndex;
+    state->nextOps = opIndex; // Remember the progress so that a later call continues from here
   }
 
-  // Sort operations as we append them : collectives and
-  // receives first, then sends.
-
-  struct ncclProxyArgs* next, *prev = NULL, *op = state->postedOps;
-  int commOpCount = op->commOpCount;
-  while (op && op->commOpCount == commOpCount) {
-    next = op->next;
-    if (op->subs[0].sendbytes) {
-      if (prev) prev->next = next;
-      else state->postedOps = next;
-      op->next = NULL;
-      NCCLCHECK(ProxyAppend(state, op));
-    } else prev = op;
-    op = next;
+  for (int i=0; i<comm->localRanks; i++) {
+    if (freeOp[i] == -1) continue;
+    int newFree = freeOp[i];
+    int oldFree = pool->freeOps[i];
+    pool->ops[freeOpEnd[i]].next = oldFree; // Splice: the tail of the consumed chain points at the current shared free list
+    if (oldFree == -1) {
+      // Nothing for the main thread to consume, we can set it.
+      pool->freeOps[i] = newFree;
+    } else {
+      // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked.
+      int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree);
+      if (swap != oldFree) {
+        if (swap != -1) return ncclInternalError; // The only change expected in between is the list having been taken (-1)
+        // Ops were recycled while we were trying to swap, just set the value directly now.
+        pool->ops[freeOpEnd[i]].next = -1;
+        pool->freeOps[i] = newFree;
+      }
+    }
   }
-  op = state->postedOps;
-  while (op && op->commOpCount == commOpCount) {
-    next = op->next;
-    op->next = NULL;
-    NCCLCHECK(ProxyAppend(state, op));
-    op = next;
-  }
-  state->postedOps = op;
-  if (op == NULL) state->postedOpsEnd = NULL;
-  NCCLCHECK(dumpProxyState(state));
-  pthread_mutex_unlock(&state->opsMutex);
-
-  if (state->poolFreed) {
-    struct ncclProxyArgs* end = state->poolFreed;
-    while (end->next) end = end->next;
-    pthread_mutex_lock(&state->poolMutex);
-    end->next = state->poolReturned;
-    state->poolReturned = state->poolFreed;
-    pthread_mutex_unlock(&state->poolMutex);
-    state->poolFreed = NULL;
-  }
-
+  profArgs.opCount = *added; // The profiling record carries the value of *added
+  ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd);
+  TIME_STOP(2);
   return ncclSuccess;
 }
 
+#include <signal.h>
+static ncclProxyProgressState* ncclLastProxyState;
+void ncclDumpProxyState(int signal) { // Signal handler (installed for SIGUSR1 by ncclProxyProgress): dump the proxy progress state last stored in ncclLastProxyState; the signal number is unused
+  if (ncclLastProxyState) (void)dumpProxyState(ncclLastProxyState); // The handler is installed before ncclLastProxyState is first assigned, so skip the dump while it is still NULL; the result is deliberately ignored
+}
 
-void* persistentThread(void *comm_) {
+void* ncclProxyProgress(void *comm_) { // Thread entry point (comm_ is a struct ncclComm*): progress the active ops and pull in newly posted ones until state->stop or *comm->abortFlag is set; always returns NULL
   struct ncclComm* comm = (struct ncclComm*)comm_;
-  struct ncclProxyState* state = &comm->proxyState;
-  char threadName[16];
-  sprintf(threadName, "NCCLproxy %5d", comm->rank);
+  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  state->nextOps = -1; // No chain of posted ops pending yet
+  signal(SIGUSR1, ncclDumpProxyState); // SIGUSR1 dumps the proxy state. NOTE(review): the dump uses printf, which POSIX does not list as async-signal-safe
+  ncclLastProxyState = state; // File-level pointer read by the handler; overwritten by each thread that runs this function
+  char threadName[NCCL_THREAD_NAMELEN];
+  snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev);
   nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
 
-  struct ncclProxyArgs** opsPtr = &state->ops;
-  while (1) {
-    if (*comm->abortFlag) {
-      return NULL;
-    }
-
-    while (*opsPtr == NULL) {
-      if (state->stop) {
-        // No more commands to process and proxy has been requested to stop
-        return NULL;
-      }
-      ncclResult_t ret = ncclProxyAppendPosted(state);
-      if (ret != ncclSuccess) {
-        comm->fatalError = ret;
-        INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
-        return NULL;
-      }
-    }
+  int lastIdle = 0; // Value of `idle` from the previous iteration, used to record idle/active transitions
+  struct ncclProxyArgs profArgs; // Only used for profiling purposes
+  while (state->stop == 0 && *comm->abortFlag == 0) {
     int idle = 1;
-    ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
+    ncclResult_t ret = progressOps(comm, state, state->active, &idle);
     if (ret != ncclSuccess) {
       comm->fatalError = ret;
       INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
       return NULL;
     }
+    if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
+    if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
     if (idle) {
-      sched_yield(); // No request progressed. Let others run.
+      int added = 0;
+      TIME_START(3);
+      ret = ncclProxyGetPostedOps(comm, &added); // Only fetch newly posted ops when nothing progressed in this iteration
+      if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
+      if (ret != ncclSuccess) { // The error is stored and logged; unlike the progressOps failure above, the loop is not left here
+        comm->fatalError = ret;
+        INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+      }
+      if (added == 0) {
+        sched_yield(); // No request progressed. Let others run.
+      }
     }
+    lastIdle = idle;
   }
+  return NULL;
 }
 
 ncclResult_t ncclProxyStart(struct ncclComm* comm) {
-  struct ncclProxyState* state = &comm->proxyState;
-  if (state->nextOps == NULL) return ncclSuccess;
-  pthread_mutex_lock(&state->opsMutex);
-  if (state->postedOps) state->postedOpsEnd->next = state->nextOps;
-  else state->postedOps = state->nextOps;
-  state->postedOpsEnd = state->nextOpsEnd;
-  state->nextOps = state->nextOpsEnd = NULL;
-  pthread_cond_signal(&state->cond);
-  pthread_mutex_unlock(&state->opsMutex);
+  struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps; // one entry per local rank, allocated by ncclProxyConnect
+  if (proxyOps == NULL) return ncclSuccess; // no proxy connection was ever made: nothing to post
+  TIME_START(1);
+  for (int r=0; r<comm->localRanks; r++) {
+    struct ncclProxyOps* ops = proxyOps+r;
+    if (ops->pool == NULL || ops->nextOps == -1) continue; // no ops pool mapped for this rank, or no ops queued
+    NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd)); // hand the queued [nextOps, nextOpsEnd] list to ncclProxyPost
+    ops->nextOps = ops->nextOpsEnd = -1; // -1 marks the local list as empty again
+    ops->count = 0;
+  }
   comm->opCount++;
+  TIME_STOP(1);
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
-  struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
-  if (state->size == 0) {
-    int p2pnChannels = 1;
-    while (p2pnChannels < comm->nChannels) p2pnChannels *= 2;
-    int p2pSize = 2*p2pnChannels*NCCL_MAX_WORK_ELEMENTS*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
-    int collNetSize = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
-    state->size = std::max(p2pSize, collNetSize);
+ncclResult_t ncclProxyProgressCreate(struct ncclComm* comm) { // Start the proxy progress thread for comm unless one is already recorded; ncclSystemError if it cannot be created
+  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  if (!state->thread) {
+    if (pthread_create(&state->thread, NULL, ncclProxyProgress, comm) != 0) return ncclSystemError; // do not name a thread that never started
+    ncclSetThreadName(state->thread, "NCCL Progress%2d", comm->cudaDev);
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) { // Stop and join the progress thread, free the proxy arg pools, then dump profiling/timing data
+  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+
+  // Request the proxy to stop and then wake it
+  if (state->opsPool) { // the progress thread is only started once the ops pool exists (see proxyProgressInit)
+    pthread_mutex_lock(&state->opsPool->mutex);
+    state->stop = true;
+    pthread_cond_signal(&state->opsPool->cond);
+    pthread_mutex_unlock(&state->opsPool->mutex);
+    pthread_join(state->thread, NULL);
   }
 
-  *size = state->size;
-
-  if (cuda && state->cudaBuff == NULL) {
-    NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, cuda));
-  } else if (state->hostBuff == NULL) {
-    NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
+  // Free off any memory allocated for the proxy arg pools
+  while (state->pools != NULL) {
+    struct ncclProxyPool *next = state->pools->next; // save the link before freeing the current node
+    free(state->pools);
+    state->pools = next;
   }
-  *ptr = cuda ? state->cudaBuff : state->hostBuff;
+
+  ncclProfilingDump();
+  TIME_PRINT("Proxy");
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr) {
-  struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
-  // Use different pools for separate send/recv.
-  char* buff = cuda ? state->cudaBuff : state->hostBuff;
-  int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
-  int globalSlot = (((type*comm->p2pnChannels+channel)*NCCL_STEPS)+slot)*NCCL_MAX_WORK_ELEMENTS+index;
-  *ptr = buff + slotSize * globalSlot;
-  return ncclSuccess;
-}
-ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr) {
-  struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
-  // Use different pools for different channels.
-  char* buff = cuda ? state->cudaBuff : state->hostBuff;
-  int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
-  int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
-  *ptr = buff + slotSize * globalSlot;
+struct ncclProxyAsyncOp { // One in-flight Setup/Connect request being serviced for a local peer
+  int type; // proxy message type of the pending request; 0 means no request is pending
+  struct ncclProxyConnection* connection; // connection handle received from the requesting peer
+  int reqSize, respSize; // byte sizes of the request and response payloads
+  char *reqBuff, *respBuff; // payload buffers, allocated in proxyConnSetupConnect and freed in proxyProgressAsync
+};
+
+struct ncclProxyLocalPeer { // Service-thread bookkeeping for one accepted peer socket
+  struct ncclSocket sock; // socket to the peer; fd is -1 while the slot is unused
+  int localRank; // set to -1 on accept, then filled in from the peer's Init message
+  struct ncclProxyAsyncOp asyncOps; // the single asynchronous operation tracked for this peer
+};
+
+#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7 // log2 of the number of connections per bank
+#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2)) // connections per bank
+#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1) // extracts the in-bank offset from a connection id
+struct ncclProxyConnectionPool { // Growable set of proxy connections, stored in fixed-size banks
+  struct ncclProxyConnection** pools; // array of banks, each holding NCCL_PROXY_CONN_POOL_SIZE connections
+  int banks; // number of banks currently allocated
+  int offset; // next free slot in the last bank; NCCL_PROXY_CONN_POOL_SIZE means that bank is full
+  struct ncclProxyAsyncOp* ops; // NOTE(review): not referenced by the pool helpers visible here - confirm its use
+};
+
+static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { // Reserve the next connection slot and return its id in *id
+  if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) { // last bank is full (also the initial state set by ncclProxyService): add a bank
+    NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1));
+    NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE));
+    pool->banks++;
+    pool->offset = 0;
+  }
+  *id = ((pool->banks-1) << NCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset; // id = bank index in the high bits, slot offset in the low bits
+  pool->offset++;
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
-  struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
-  CUDACHECK(hipFree(state->cudaBuff));
-  NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, int id, struct ncclProxyConnection** conn) { // Resolve a connection id to its slot; ncclInternalError if the id does not name an allocated bank
+  int bank = id>>NCCL_PROXY_CONN_POOL_SIZE_POW2; // high bits select the bank
+  int offset = id&NCCL_PROXY_CONN_POOL_MASK; // low bits select the slot within the bank
+  if ((pool->pools == NULL) || (id < 0) || (bank >= pool->banks) || (pool->pools[bank] == NULL)) return ncclInternalError; // valid banks are [0, banks): bank == banks would read past the bank array
+  *conn = pool->pools[bank]+offset;
+  return ncclSuccess;
+}
+
+static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+  // Release a proxy connection through the proxyFree hook of its transport,
+  // using the send or recv side according to connection->send.
+  struct ncclTransport* transport = ncclTransports+connection->transport;
+  struct ncclTransportComm* side = connection->send ? &transport->send : &transport->recv;
+  NCCLCHECK(side->proxyFree(connection, comm));
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclComm* comm) { // Free every allocated connection, then each bank and the bank array
+  for (int b=0; b<pool->banks; b++) {
+    int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE; // only the last bank can be partially filled
+    for (int i=0; i<max; i++) {
+      NCCLCHECK(proxyFree(pool->pools[b]+i, comm)); // NOTE(review): if NCCLCHECK returns on error, the remaining connections and banks are never freed - confirm
+    }
+    free(pool->pools[b]);
+  }
+  free(pool->pools);
+  return ncclSuccess;
+}
+
+#include "transport.h"
+
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) { // Open (or reuse) the socket to rank's proxy service and register a new connection, filling proxyConn
+  // Keep one connection per local rank
+  proxyConn->connection = NULL;
+  proxyConn->rank = rank;
+  if (comm->proxyState.peerSocks == NULL) { // first call: allocate the per-local-rank tables
+    NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
+    NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
+    NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
+    for (int r=0; r<comm->localRanks; r++) {
+      comm->proxyState.peerSocks[r].fd = -1; // -1 marks "not connected yet"
+      comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag;
+    }
+  }
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
+  struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank;
+  if (sock->fd == -1) { // connect lazily, once per local rank
+    memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress));
+    NCCLCHECK(ncclSocketConnect(sock));
+  }
+  int type = ncclProxyMsgInit;
+  NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int))); // Init request: type, transport, send flag, our local rank (read back in this order by proxyConnInit)
+  NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
+  NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
+  NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
+  NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*))); // the service replies with its connection pointer, kept as an opaque handle
+  struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv;
+  // If we need proxy progress, map progress ops
+  if (tcomm->proxyProgress) {
+    char poolPath[] = "/dev/shm/nccl-XXXXXX";
+    NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1)); // receive the 6-character suffix over the XXXXXX placeholder
+    struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
+    if (proxyOps->pool == NULL) { // map the shared ops pool only once per local rank
+      NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0));
+      proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
+    }
+  }
+  INFO(NCCL_NET, "Connection to proxy localRank %d -> connection %p", proxyConn->localRank, proxyConn->connection);
+  proxyConn->comm = comm;
+  return ncclSuccess;
+}
+
+const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" }; // Printable names, indexed by proxy message type
+ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { // Send one request of the given type on proxyConn's socket and read back respSize bytes of response
+  if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError; // ncclProxyConnect has not set up the socket table
+  struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank;
+  if (sock->fd == -1) return ncclInternalError; // never connected, or closed after an earlier failure
+  ncclResult_t ret;
+  NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); // wire order: type, connection handle, reqSize, respSize, request payload
+  NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
+  NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
+  NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
+  if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
+  if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error);
+  return ncclSuccess;
+error:
+  WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
+  close(sock->fd); // release the descriptor before forgetting it; resetting fd alone would leak it
+  sock->fd = -1;
+  return ret;
+}
+
+static ncclResult_t proxyProgressInit(struct ncclComm* comm) { // On first use: create the shared ops pool, build its per-rank free lists, init its process-shared mutex/cond, start the progress thread
+  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  if (state->opsPool == NULL) {
+    int size = sizeof(struct ncclProxyOpsPool);
+    struct ncclProxyOpsPool* pool = NULL;
+
+    char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
+    shmPath[0] = '\0'; // empty path on input; the generated path is read back from shmPath below
+    NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1));
+
+    // Init pool
+    pool->nextOps = -1;
+
+    // The service thread may be launched already but localRanks may not be set yet.
+    while (comm->localRanks == 0) sched_yield();
+
+    for (int r=0; r<comm->localRanks; r++) { // give each local rank its own chain of MAX_OPS_PER_PEER free ops
+      pool->freeOps[r] = r*MAX_OPS_PER_PEER;
+      for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
+      pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1; // -1 terminates the chain
+    }
+
+    // Setup mutex/cond to work inter-process
+    pthread_mutexattr_t mutexAttr;
+    pthread_mutexattr_init(&mutexAttr);
+    pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
+    pthread_mutex_init(&pool->mutex, &mutexAttr);
+    pthread_condattr_t condAttr;
+    pthread_condattr_init(&condAttr); // the attribute object must be initialized before setpshared/cond_init may use it
+    pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
+    pthread_cond_init(&pool->cond, &condAttr);
+    state->opsPool = pool;
+    memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); // keep only the 6-character suffix of the path
+
+    // All ops structures are created, we can start the progress thread
+    NCCLCHECK(ncclProxyProgressCreate(comm));
+  }
+  return ncclSuccess;
+}
+
+static void proxyOpsFree(struct ncclComm* comm) { // Close the service thread's mapping of the shared ops pool; a failure is only logged
+  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  if (state->opsPool == NULL) return; // proxyProgressInit never created a pool; same guard as ncclProxyShmUnlink
+  // The pointer is left untouched so a later ncclProxyShmUnlink still sees that a pool existed.
+  if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) WARN("[Service thread] shm close failed");
+}
+
+ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { // Unlink the shared-memory file backing the ops pool; always returns ncclSuccess
+  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  if (state->opsPool == NULL) return ncclSuccess; // no pool was created, nothing to unlink
+
+  char shmPath[] = "/dev/shm/nccl-XXXXXX";
+  memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1); // overwrite the XXXXXX placeholder with the saved 6-character suffix
+  if (ncclShmUnlink(shmPath) != ncclSuccess) {
+    WARN("[Service thread] shm unlink failed"); // failure is only logged, not propagated
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { // Handle an Init message: allocate a connection, read its parameters, reply with its handle
+  struct ncclSocket* sock = &peer->sock;
+  int id;
+  struct ncclProxyConnection* connection;
+  NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
+  NCCLCHECK(ncclProxyGetConnection(connectionPool, id, &connection));
+  connection->sock = sock;
+  NCCLCHECK(ncclSocketRecv(sock, &connection->transport, sizeof(int))); // fields arrive in the order ncclProxyConnect sends them: transport, send, localRank
+  NCCLCHECK(ncclSocketRecv(sock, &connection->send, sizeof(int)));
+  NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
+  connection->localRank = peer->localRank;
+  NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*))); // the raw pointer value is the handle the peer echoes back in later requests
+  connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv;
+  // If we need proxy progress, let's allocate ops and start the thread
+  if (connection->tcomm->proxyProgress) {
+    NCCLCHECK(proxyProgressInit(comm));
+    struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+    NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); // send the 6-character shm suffix so the peer can map the same ops pool
+  }
+  INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
+  return ncclSuccess;
+}
+
+static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { // Handle a SharedInit message: read nChannels and forward it to the transport's proxySharedInit hook
+  struct ncclSocket* sock = &peer->sock;
+  struct ncclProxyConnection* connection;
+  NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(void*))); // connection handle as sent by the peer; used as a pointer without validation
+  int reqSize, respSize;
+  NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
+  NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
+  if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError; // the request must be exactly one int (nChannels) with no response
+  int nChannels;
+  NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
+  if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels)); // the hook is optional
+  return ncclSuccess;
+}
+
+static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) { // Advance a pending Setup/Connect op; once done, send the response, free its buffers and decrement *asyncOpCount
+  int done = 1; // passed to the transport hook by address; treated as completed unless the hook clears it
+  if (op->type == ncclProxyMsgSetup) {
+    NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+  } else if (op->type == ncclProxyMsgConnect) {
+    NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+  } else return ncclInternalError; // only Setup and Connect are handled asynchronously
+  if (done) {
+    if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
+    if (op->reqBuff) free(op->reqBuff);
+    if (op->respBuff) free(op->respBuff);
+    op->reqBuff = NULL;
+    op->respBuff = NULL;
+    op->type = 0; // 0 marks the slot as having no pending op
+    (*asyncOpCount)--;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) { // Read a Setup/Connect request into the peer's async op slot and start progressing it
+  struct ncclSocket* sock = &peer->sock;
+  struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps;
+  asyncOp->type = type;
+  asyncOp->reqBuff = asyncOp->respBuff = NULL; // a zero-size request/response allocates nothing, so proxyProgressAsync must not see (and free) a stale pointer
+  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
+  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
+  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
+  if (asyncOp->reqSize) {
+    NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
+    NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
+  }
+  if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
+  (*asyncOpCount)++; // counted as pending until proxyProgressAsync reports it done
+  NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount)); // first attempt right away; the service loop retries while the op stays pending
+  return ncclSuccess;
+}
+
+#include <poll.h>
+
+void* ncclProxyService(void* _args) { // Service thread entry point: accepts local peers and executes their proxy requests until stopped; _args is a struct ncclComm*
+  struct ncclComm* comm =  (struct ncclComm *) _args;
+  if (hipSetDevice(comm->cudaDev) != hipSuccess) {
+    WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
+  }
+  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  // Prepare poll descriptor
+  struct ncclProxyConnectionPool connectionPool;
+  connectionPool.pools = NULL;
+  connectionPool.banks = 0;
+  connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; // "full" marker so the first ncclProxyNewConnection allocates a bank
+  struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; // one entry per peer slot, plus the listen socket in the last entry
+  struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
+  for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
+    peers[s].sock.fd = pollfds[s].fd = -1;
+    peers[s].sock.abortFlag = NULL;
+    peers[s].sock.asyncFlag = 0;
+    pollfds[s].events = POLLHUP|POLLIN;
+    peers[s].asyncOps.type = 0;
+    peers[s].asyncOps.reqBuff = peers[s].asyncOps.respBuff = NULL; // peers[] is an uninitialized stack array; these pointers are later tested and freed
+  }
+  pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
+  pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
+
+  int maxnpeers = 0; // highest peer slot ever used + 1; bounds the scan loops
+  int npeers = 0; // currently open peer connections
+  int stop = 0; // 0: running, 1: Stop received (drain remaining peers), 2: Abort received (exit now)
+  int asyncOpCount = 0; // pending async Setup/Connect ops; non-zero makes poll() non-blocking
+  while (stop == 0 || (stop == 1 && npeers > 0)) {
+    if (poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : -1) < 0) {
+      if (errno == EINTR) continue; // interrupted by a signal (e.g. the SIGUSR1 state dump): retry instead of killing the service
+      WARN("[Proxy Service] Poll failed with error %d", errno); // report errno; the old "int error = poll(...) < 0" always logged 1
+      return NULL;
+    }
+    if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
+      int s = 0;
+      while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++; // find the first free peer slot
+      if (s == NCCL_MAX_LOCAL_RANKS) {
+        WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
+        return NULL;
+      }
+      if (maxnpeers < s+1) maxnpeers = s+1;
+      struct ncclSocket* sock = &peers[s].sock;
+      if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) {
+        WARN("[Service thread] Accept failed %s", strerror(errno));
+      } else {
+        pollfds[s].fd = sock->fd;
+        npeers++;
+        peers[s].localRank = -1; // unknown until the peer's Init message arrives
+      }
+    }
+    for (int s=0; s<maxnpeers; s++) {
+      struct ncclProxyLocalPeer* peer = peers+s;
+      struct ncclSocket* sock = &peer->sock;
+      struct ncclProxyAsyncOp* op = &peer->asyncOps;
+      int closeConn = 0;
+      int type = 0;
+      ncclResult_t res = ncclSuccess;
+      if (op->type != 0) { // a Setup/Connect op is still pending for this peer: progress it before reading new messages
+        res = proxyProgressAsync(op, comm, &asyncOpCount);
+        type = op->type;
+        if (res != ncclSuccess) op->type = 0; // NOTE(review): asyncOpCount is not decremented and the op buffers are not freed on this path - confirm
+      } else if (pollfds[s].revents & POLLIN) {
+        int closed;
+        if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
+          WARN("[Service thread] Could not receive type from localRank %d", peer->localRank);
+          closeConn = 1;
+        } else if (closed) {
+          INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
+          closeConn = 1;
+        } else {
+          if (type == ncclProxyMsgAbort) {
+            stop = 2;
+            closeConn = 1;
+          } else if (type == ncclProxyMsgStop) {
+            stop = 1;
+            closeConn = 1;
+          } else if (type == ncclProxyMsgClose) {
+            closeConn = 1;
+          } else if (type == ncclProxyMsgInit) {
+            res = proxyConnInit(peers+s, &connectionPool, comm);
+          } else if (type == ncclProxyMsgSharedInit) {
+            res = proxyConnSharedInit(peers+s, &connectionPool, comm);
+          } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
+            res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
+          } else {
+            WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank);
+            closeConn = 1;
+          }
+        }
+      } else if (pollfds[s].revents & POLLHUP) {
+        closeConn = 1;
+      }
+      if (res != ncclSuccess) { // localRank is still -1 if the failure happened before Init completed: do not index localRankToRank with it
+        WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], peer->localRank >= 0 ? comm->localRankToRank[peer->localRank] : -1, res);
+        closeConn = 1;
+      }
+      if (closeConn) {
+        close(sock->fd);
+        sock->fd = pollfds[s].fd = -1;
+        npeers--;
+      }
+    }
+  }
+  // Wait for all operations to complete and stop progress thread before freeing any resource
+  if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
+    WARN("[Proxy Service] proxyDestroy failed");
+  }
+  for (int s=0; s<maxnpeers; s++) {
+    if (peers[s].sock.fd != -1) close(peers[s].sock.fd);
+  }
+  ncclProxyFreeConnections(&connectionPool, comm);
+  close(comm->proxyState.listenSock->fd);
+  free(comm->proxyState.listenSock);
+  proxyOpsFree(comm);
+  return NULL;
+}
+
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) { // Record the listen socket and peer address table in comm->proxyState
+  comm->proxyState.listenSock = sock; // later closed and freed by ncclProxyService
+  comm->proxyState.peerAddresses = peerAddresses; // later freed by ncclProxyDestroy
+  ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev); // NOTE(review): the thread is created in ncclProxyCreate - confirm it already exists when this runs
   return ncclSuccess;
 }
 
 ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
-  if (!comm->proxyThread) {
-    comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
-    comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER;
-    comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER;
-    comm->proxyState.ops = NULL;
-    pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
-  }
+  // comm->proxyState.thread is pthread_join()'d by commFree() in init.cc
+  pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm); // starts the service thread; NOTE(review): the return value is ignored, so a failed create is reported as success
   return ncclSuccess;
 }
 
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
   struct ncclProxyState* state = &comm->proxyState;
-
-  // Request the proxy to stop and then wake it
-  pthread_mutex_lock(&state->opsMutex);
-  state->stop = true;
-  pthread_cond_signal(&state->cond);
-  pthread_mutex_unlock(&state->opsMutex);
-  if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
-
-  // Free off any memory allocated for the proxy arg pools
-  pthread_mutex_lock(&state->poolMutex);
-  struct ncclProxyState* proxyState = &comm->proxyState;
-  while (proxyState->pools != NULL) {
-    struct ncclProxyPool *next = proxyState->pools->next;
-    free(proxyState->pools);
-    proxyState->pools = next;
+  if (state->peerAddresses) { // tell our own service thread to stop (or abort) over a fresh connection to our own address
+    struct ncclSocket sock;
+    sock.abortFlag = NULL;
+    sock.asyncFlag = 0;
+    memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
+    NCCLCHECK(ncclSocketConnect(&sock));
+    int type = (*comm->abortFlag) ? ncclProxyMsgAbort : ncclProxyMsgStop; // Abort when the communicator was aborted, Stop otherwise
+    NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
+    close(sock.fd);
+    free(state->peerAddresses);
+  }
+  if (state->peerSocks) { // release what ncclProxyConnect set up for each local rank
+    for (int i=0; i<comm->localRanks; i++) {
+      if (state->peerSocks[i].fd != -1) { // only ranks whose socket is still open
+        if (state->proxyOps[i].pool) {
+          NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool)));
+        }
+        if (state->sharedDevMems[i]) {
+          CUDACHECK(hipIpcCloseMemHandle(state->sharedDevMems[i]));
+        }
+        int type = ncclProxyMsgClose;
+        if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int))); // the Close message is skipped when aborting
+        close(state->peerSocks[i].fd);
+      }
+    }
+    free(state->peerSocks);
+    free(state->proxyOps);
+    free(state->sharedDevMems);
   }
-  pthread_mutex_unlock(&state->poolMutex);
-
-  NCCLCHECK(ncclProxySharedBuffersDestroy(comm));
-
   return ncclSuccess;
 }
diff --git a/src/transport.cc b/src/transport.cc
index 62940498cb..6b279cbdbb 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,15 +8,19 @@
 #include "comm.h"
 #include "info.h"
 #include "bootstrap.h"
+#define ENABLE_TIMER 0
+#include "timer.h"
 
 extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
 extern struct ncclTransport netTransport;
+extern struct ncclTransport collNetTransport;
 
 struct ncclTransport ncclTransports[NTRANSPORTS] = {
   p2pTransport,
   shmTransport,
   netTransport,
+  collNetTransport
 };
 
 template <int type>
@@ -25,14 +29,12 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
   struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
   struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
                                                   comm->channels[channelId].peers[peer].recv + connIndex;
-
   // handle intra-node network connections
   int n1 = -1, n2 = -1;
   if (connIndex == NCCL_CONN_IDX_P2P_NET) {
     NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, comm->rank, graph, channelId, (type == 1) ? 1 : 0, &n1));
     NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 0 : 1, &n2));
   }
-
   bool xgmi;
   NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
   for (int t=0; t<NTRANSPORTS; t++) {
@@ -79,11 +81,9 @@ void dumpData(struct ncclConnect* data, int ndata) {
 }
 
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
-#if CUDART_VERSION >= 11030
   // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
   hipStream_t transportSetupStream;
   CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
-#endif
   int highestType = TRANSPORT_P2P;  // track highest transport type
 
   struct ncclConnect data[2*MAXCHANNELS];
@@ -97,12 +97,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     struct ncclConnect* recvData = data;
     int sendChannels = 0, recvChannels = 0;
     int type;
+    TIME_START(0);
     for (int c=0; c<MAXCHANNELS; c++) {
       if (recvMask & (1<<c)) {
         NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
         if (type > highestType) highestType = type;
       }
     }
+    TIME_STOP(0);
+    TIME_START(1);
     struct ncclConnect* sendData = recvData+recvChannels;
     for (int c=0; c<MAXCHANNELS; c++) {
       if (sendMask & (1<<c)) {
@@ -110,7 +113,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
         if (type > highestType) highestType = type;
       }
     }
+    TIME_STOP(1);
 
+    TIME_START(2);
     if (sendPeer == recvPeer) {
       if (recvChannels+sendChannels) {
          NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
@@ -124,38 +129,34 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
       if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
       if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
     }
+    TIME_STOP(2);
 
+    TIME_START(3);
     for (int c=0; c<MAXCHANNELS; c++) {
       if (sendMask & (1<<c)) {
         struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
         NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
         conn->connected = 1;
-#if CUDART_VERSION >= 11030
         CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
-#else
-        CUDACHECK(hipMemcpy(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
-#endif
       }
     }
+    TIME_STOP(3);
+    TIME_START(4);
     for (int c=0; c<MAXCHANNELS; c++) {
       if (recvMask & (1<<c)) {
         struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
         NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
         conn->connected = 1;
-#if CUDART_VERSION >= 11030
         CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
-#else
-        CUDACHECK(hipMemcpy(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
-#endif
       }
     }
+    TIME_STOP(4);
     comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
   }
-#if CUDART_VERSION >= 11030
   CUDACHECK(hipStreamSynchronize(transportSetupStream));
   CUDACHECK(hipStreamDestroy(transportSetupStream));
-#endif
   if (highestTransportType != NULL) *highestTransportType = highestType;
+  TIME_PRINT("P2P Setup/Connect");
   return ncclSuccess;
 }
 
@@ -250,9 +251,9 @@ cleanup:
 
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
   // AllGather collNet setup results
-  int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0};
-  allGatherFailures[comm->intraNodeRank] = collNetSetupFail;
-  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int)));
+  int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0};
+  allGatherFailures[comm->localRank] = collNetSetupFail;
+  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int)));
   for (int i=0; i<comm->localRanks; i++) {
     if (allGatherFailures[i] != 0) {
       collNetSetupFail = 1;
@@ -260,7 +261,7 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
     }
   }
   if (collNetSetupFail) {
-    if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
+    if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
     return ncclSystemError;
   }
   return ncclSuccess;
@@ -273,12 +274,12 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
     struct ncclPeer* peer = channel->peers+comm->nRanks;
     for (int b=0; b<NCCL_MAX_CONNS; b++) {
       struct ncclConnector* send = peer->send + b;
-      if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
+      if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
       send->transportResources = NULL; // avoid double free
     }
     for (int b=0; b<NCCL_MAX_CONNS; b++) {
       struct ncclConnector* recv = peer->recv + b;
-      if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
+      if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
       recv->transportResources = NULL; // avoid double free
     }
   }
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 9f9d9b5dd1..01f3ee6807 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,11 +8,15 @@
 #include "comm.h"
 #include "coll_net.h"
 #include "graph.h"
+#include "proxy.h"
+#include "gdrwrap.h"
 
-#define COLLNET_GROUP_NSUBS 8
-#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
+int64_t ncclParamGdrCopySyncEnable();
+int64_t ncclParamGdrCopyFlushEnable();
 
 struct collNetRecvConnectInfo {
+  int rank;
+  int nranks;
   collNetHandle_t collNetHandle;
 };
 
@@ -21,132 +25,287 @@ struct collNetSendConnectInfo {
   void* reqFifo;
 };
 
+#define COLLNET_GROUP_NSUBS 8
+#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
+
+#define NCCL_NET_MAP_HOSTMEM 0
+#define NCCL_NET_MAP_DEVMEM 1
+#define NCCL_NET_MAP_SHARED_HOSTMEM 2
+#define NCCL_NET_MAP_SHARED_DEVMEM 3
+#define NCCL_NET_MAP_GDCMEM 4
+#define NCCL_NET_MAP_MEMS 5
+
+#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000
+#define NCCL_NET_MAP_MASK_SHARED 0x80000000
+#define NCCL_NET_MAP_MASK_USED   0x20000000
+#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff
+
+#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \
+  ((mapStruct)->offsets.offsetName >> 30)
+
+#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \
+  (((mapStruct)->offsets.offsetName >> 29) == 0)
+
+#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \
+  (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \
+   (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
+
+#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \
+  (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0)
+
+#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { /* reserve memSize bytes for offsetName and record its tagged offset */ \
+    int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; /* flag bits: used | dev | shared */ \
+    if ((shared) == 0) { \
+      if (dev) { \
+        (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; /* offset = current end of the dev bank */ \
+        (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += (memSize); \
+      } else { \
+        (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; /* offset = current end of the host bank */ \
+        (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += (memSize); \
+      } \
+    } else { \
+      (mapStruct)->offsets.offsetName = bank; /* shared banks: flags only, offset 0, bank size untouched */ \
+    } \
+} while (0) /* no trailing ';' here: the call site supplies it, keeping the macro safe inside if/else */
+
+struct connectMapMem{
+  char* gpuPtr;
+  char* cpuPtr;
+  int size;
+};
+
+struct connectMap {
+  int shared;
+  // The 3 MSBs of each offset determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem.
+  struct connectMapMem mems[NCCL_NET_MAP_MEMS];
+  // Offsets. 3 MSBs are the shared/dev/used flags (NCCL_NET_MAP_MASK_*), 000 indicates NULL; the low 29 bits are the byte offset within the bank.
+  struct {
+    uint32_t sendMem;
+    uint32_t recvMem;
+    uint32_t buffs[NCCL_NUM_PROTOCOLS];
+  } offsets;
+};
+
 struct reqSlot {
   volatile void* recvBuff;
   volatile int size;
 };
 
-struct collNetSendResources {
-  struct ncclComm* comm;
+struct sendResources {
+  struct connectMap map;
   void* collNetComm;
   struct ncclSendMem* sendMem;
   struct ncclRecvMem* recvMem;
+
+  int rank;
+  int nranks;
   int netDev;
   int useGdr;
+  uint64_t* gdcSync;
+  void* gdrDesc;
   void* sendMhandles[NCCL_NUM_PROTOCOLS];
   void* recvMhandles[NCCL_NUM_PROTOCOLS];
-  struct ncclRecvMem* devRecvMem;
   uint64_t step;
-  uint64_t llLastCleaning;
   struct reqSlot (*reqFifo)[NCCL_STEPS];
   int collNetRank;
   uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };
 
-struct collNetRecvResources {
-  struct ncclComm* comm;
+struct recvResources {
+  struct connectMap map;
   void* collNetComm;
   struct ncclSendMem* sendMem;
   struct ncclRecvMem* recvMem;
+
+  int rank;
+  int nranks;
   int netDev;
   int useGdr;
+  uint64_t* gdcSync;
+  uint64_t* gdcFlush;
+  void* gdrDesc;
   void* mhandles[NCCL_NUM_PROTOCOLS];
-  struct ncclRecvMem* devRecvMem;
   uint64_t step;
-  uint64_t llLastCleaning;
   struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
   int collNetRank;
   uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };
 
-struct collNetSharedResources {
-  void* collNetListenComms[MAXCHANNELS];
-  void* collNetComms[MAXCHANNELS];
-  int collNetCommRefCount[MAXCHANNELS];
-};
-
 /* Determine if we can communicate with the peer */
-ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   *ret = 1;
   return ncclSuccess;
 }
 
-ncclResult_t collNetSharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
-  struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
+struct setupReq {
+  int netDev;
+  int useGdr;
+};
+
+
+/* Setup send connector, and return connect information for others in the coll
+ * communicator to connect to me */
+static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
+  struct setupReq req;
+
+  int proxyRank;
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
+  send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
+  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+
+  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
+      req.useGdr ? "/GDRDMA" : "");
+  return ncclSuccess;
+}
+
+static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
+  struct setupReq req;
+
+  int proxyRank;
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
+  recv->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
+  struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
+  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
+
+  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
+      req.useGdr ? "/GDRDMA" : "");
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetDumpMap(struct connectMap* map) { // Debug helper: prints banks 0-3 and every offset of a connect map to stdout.
+  printf("Dump map\n");
+  struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
+  printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+  mem = map->mems+NCCL_NET_MAP_DEVMEM;
+  printf("Mem 1: Vid  mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); // size goes right after the bank name, like Mem 0
+  mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
+  printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+  mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
+  printf("Mem 3: Shared Vid  mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); // same layout as the other banks
+  printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+      map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+      NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET,
+      NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem));
+  printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+      map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+      NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET,
+      NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem));
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { // one line per protocol buffer offset
+    printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p,
+        map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+        NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET,
+        NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]));
+  }
+  printf("End of dump\n");
+  return ncclSuccess; // always succeeds; the return type only exists so callers can wrap it in NCCLCHECK
+}
+
+struct collNetConnectArgs {
+  int rank;
+  int nranks;
+  struct ncclConnect* connectInfos;
+};
+
+static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+  // We're on the same process as the proxy. We can pass a pointer to a struct.
+  struct collNetConnectArgs args = { rank, nranks, connectInfos };
+  struct connectMap* map;
+  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+
+  // If collnet connect failed, propagate error to fallback on regular p2p
+  if (map == NULL) return ncclSystemError;
+
+  //NCCLCHECK(collNetDumpMap(map));
+
+  struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+  void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+  send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head;
+
+  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+  send->conn.tail = &recvMem->tail;
+  send->conn.sizesFifo = recvMem->sizesFifo;
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
+  send->conn.offsFifo = recvMem->offsFifo;
+
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+    send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+  return ncclSuccess;
+}
+
+static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+  // We're on the same process as the proxy. We can pass a pointer to a struct.
+  struct collNetConnectArgs args = { rank, nranks, connectInfos };
+  struct connectMap* map;
+  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+
+  // If collnet connect failed, propagate error to fallback on regular p2p
+  if (map == NULL) return ncclSystemError;
+
+  //NCCLCHECK(collNetDumpMap(map));
+
+  struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+  recv->conn.head = &sendMem->head;
+
+  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+  void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+  recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
+  recv->conn.offsFifo = recvMem->offsFifo;
+
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t sendFree(struct ncclConnector* send) {
+  return ncclSuccess;
+}
+
+static ncclResult_t recvFree(struct ncclConnector* recv) {
+  return ncclSuccess;
+}
+
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct setupReq* req = (struct setupReq*)reqBuff;
+  if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
+
+  struct sendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  connection->transportResources = resources;
+  connection->shared = 1;
+
+  resources->netDev = req->netDev;
+  resources->useGdr = req->useGdr;
+  return ncclSuccess;
+}
+
+struct sharedResources {
+  void* collNetListenComms[MAXCHANNELS];
+  void* collNetComms[MAXCHANNELS];
+  int commRefCount[NCCL_MAX_NETDEVS];
+};
+
+ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
+  struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
   if (resources == NULL) {
     NCCLCHECK(ncclCalloc(&resources, 1));
-    comm->proxyState.sharedBuffs.collNetResources = resources;
+    comm->proxyState.progressState.collNet.resources = resources;
   }
   if (resources->collNetComms[netDev] == NULL)
     NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
   return ncclSuccess;
 }
 
-/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
-ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
-  struct collNetSendResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  send->transportResources = resources;
-  send->conn.shared = 1;
-  resources->comm = comm;
-
-  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
-
-  send->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev+1;
-
-  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
-
-  int recvSize = offsetof(struct ncclRecvMem, buff);
-  // Simple uses shared buffers and we don't support LL128
-  recvSize += send->comm->buffSizes[NCCL_PROTO_LL];
-
-  if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr));
-    CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
-    send->conn.curr_hdp_reg = resources->curr_hdp_reg;
-  }
-  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
-
-  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
-  return ncclSuccess;
-}
-
-/* Setup recv connector */
-ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
-  struct collNetRecvResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  recv->transportResources = resources;
-  recv->conn.shared = 1;
-  resources->comm = comm;
-
-  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
-
-  recv->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev;
-
-  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
-
-  int recvSize = offsetof(struct ncclRecvMem, buff);
-  // Simple uses shared buffers and we don't support LL128
-  recvSize += recv->comm->buffSizes[NCCL_PROTO_LL];
-
-  if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr));
-  }
-  NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
-
-  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
-  struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
-
-  NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle));
-  return ncclSuccess;
-}
-
-ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
-  struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
+static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
+  struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
   if (resources->collNetComms[netDev] == NULL) {
     // Connect to coll comm
     collNetHandle_t** handlePtrs = NULL;
@@ -159,157 +318,255 @@ ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct nccl
           resources->collNetListenComms[netDev],
           resources->collNetComms+netDev);
     free(handlePtrs);
-    NCCLCHECK(ret);
-    // Close listen comm
-    NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
+    if (ret == ncclSuccess) {
+      // Close listen comm
+      NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
+    } else {
+      resources->collNetListenComms[netDev] = NULL;
+    }
   }
   *collNetComm = resources->collNetComms[netDev];
-  resources->collNetCommRefCount[netDev]++;
+  if (*collNetComm) resources->commRefCount[netDev]++;
   return ncclSuccess;
 }
 
-ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
-  // Setup device pointers
-  struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
-  struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
+static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) { // Drop one reference on netDev's coll comm; free the shared state once no device is referenced.
+  struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+  resources->commRefCount[netDev]--; // NOTE(review): sharedConnect only increments when the comm is non-NULL, but this decrement is unconditional - confirm the failed-connect path
+  if (resources->commRefCount[netDev] == 0) { // last user of this device's collective comm
+    NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
+  }
+  for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess; // any non-zero count (including a negative one) keeps the struct alive
+  comm->proxyState.progressState.collNet.resources = NULL; // cleared so sharedListen allocates a fresh struct next time
+  free(resources);
+  return ncclSuccess; // NOTE(review): commRefCount has NCCL_MAX_NETDEVS entries but collNetComms has MAXCHANNELS, both indexed by netDev - confirm sizes
+}
 
-  // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
-  send->conn.buffs[NCCL_PROTO_LL] = resources->recvMem->buff;
-  send->conn.buffs[NCCL_PROTO_LL128] = send->conn.buffs[NCCL_PROTO_SIMPLE] = NULL;
-  send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
+  struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
+  if (state->size == 0) {
+    state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
+  }
 
-  // Head/Tail/Opcount/Fifos are always on host
-  send->conn.tail = &resources->recvMem->tail;
-  send->conn.sizesFifo = resources->recvMem->sizesFifo;
-  send->conn.ptrsFifo = resources->recvMem->ptrsFifo;
-  send->conn.head = &resources->sendMem->head;
-  resources->sendMem->head = -NCCL_STEPS; // Don't give any credit yet when sharing buffers
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
+  *size = state->size;
+
+  if (cuda && state->cudaBuff == NULL) {
+    NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, cuda));
+  }
+  if (!cuda && state->hostBuff == NULL) {
+    NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
+  }
+  *gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
+  return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) {
+  // Use different pools for different channels and also separate send/recv.
+  int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
+  int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
+  *offset = slotSize * globalSlot;
+  return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) {
+  struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
+  if (state->size == 0) return ncclSuccess;
+  CUDACHECK(hipFree(state->cudaBuff));
+  NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+  // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once.
+  state->size = 0;
+  return ncclSuccess;
+}
+
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct setupReq* req = (struct setupReq*)reqBuff;
+  if (reqSize != sizeof (struct setupReq)) return ncclInternalError;
+
+  struct recvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  connection->transportResources = resources;
+  connection->shared = 1;
+
+  resources->netDev = req->netDev;
+  resources->useGdr = req->useGdr;
+
+  collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
+  if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
+
+  NCCLCHECK(sharedListen(comm, req->netDev, netHandle));
+  return ncclSuccess;
+}
+
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+  struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
+  struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
+
+  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
 
   // Get info from recv side
-  resources->collNetRank = rank;
+  resources->collNetRank = args->rank;
   resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo);
 
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
     resources->recvMhandles[p] = info->mhandles[p];
 
-  NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
+  NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+
+  // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
+  if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+  if (resources->collNetComm == NULL) {
+    *((struct connectMap**)respBuff) = NULL;
+    return ncclSuccess;
+  }
+  connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev;
+
+  struct connectMap* map = &resources->map;
+
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+  NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+  map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+  if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
+    uint64_t *cpuPtr, *gpuPtr;
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+
+    resources->gdcSync = cpuPtr;
+    struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+    gdcMem->cpuPtr = (char*)cpuPtr;
+    gdcMem->gpuPtr = (char*)gpuPtr;
+    gdcMem->size = sizeof(uint64_t); // sendMem->head
+  }
+
+  resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+  resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+  // Don't give credits yet in shared mode.
+  resources->sendMem->head = -NCCL_STEPS;
 
-  int size;
-  char* ptr;
   // Allocate & Register shared buffers for the Simple protocol
-  NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, &size, &ptr));
-  NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
+  int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+  struct connectMapMem* mapMem = map->mems+bank;
+  NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+  NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+
+  NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
         resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
         &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
 
-  // Allocate & Register shared buffers for the LL protocol
-  NCCLCHECK(ncclProxySharedBuffersInit(send->comm, 0, &size, &ptr));
-  NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
-        NCCL_PTR_HOST,
-        &resources->sendMhandles[NCCL_PROTO_LL]));
+  *((struct connectMap**)respBuff) = &resources->map;
   return ncclSuccess;
 }
 
-ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
-  // Setup device pointers
-  struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
-  struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
-  resources->collNetRank = rank;
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+  struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
 
-  // Intermediate buffering on GPU for GPU Direct RDMA
-  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
-  int offset = 0;
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
-    offset += recv->comm->buffSizes[p];
+  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+  struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
+  resources->collNetRank = args->rank;
+
+  NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+
+  // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
+  if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+  if (resources->collNetComm == NULL) {
+    *((struct connectMap**)respBuff) = NULL;
+    return ncclSuccess;
   }
-  recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+  connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1;
 
-  // Head/Tail/Opcount are always on host
-  recv->conn.tail = &resources->recvMem->tail;
-  recv->conn.ptrsFifo = resources->recvMem->ptrsFifo;
-  recv->conn.head = &resources->sendMem->head;
+  struct connectMap* map = &resources->map;
 
-  NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
 
-  int size;
-  char* ptr;
+  NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+  map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+  if (ncclGdrCopy) {
+    uint64_t *cpuPtr, *gpuPtr;
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
+
+    if (ncclParamGdrCopySyncEnable()) {
+      resources->gdcSync = cpuPtr;
+      struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+      gdcMem->cpuPtr = (char*)cpuPtr;
+      gdcMem->gpuPtr = (char*)gpuPtr;
+      gdcMem->size = sizeof(uint64_t);
+    }
+    if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1;
+  }
+
+  resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+  resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
 
   // Allocate & Register shared buffers for the Simple protocol
-  NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, &size, &ptr));
-  NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
+  int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+  struct connectMapMem* mapMem = map->mems+bank;
+  NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+  NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+
+  NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
         resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
         &resources->mhandles[NCCL_PROTO_SIMPLE]));
 
-  // Allocate & Register shared buffers for the LL protocol
-  NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, 0, &size, &ptr));
-  NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
-        NCCL_PTR_HOST,
-        &resources->mhandles[NCCL_PROTO_LL]));
-
   // Pass info to send side
   info->reqFifo = resources->reqFifo;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
     info->mhandles[p] = resources->mhandles[p];
 
+  if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+  *((struct connectMap**)respBuff) = &resources->map;
   return ncclSuccess;
 }
 
-ncclResult_t collNetSharedFree(struct ncclComm* comm, int netDev) {
-  struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
-  resources->collNetCommRefCount[netDev]--;
-  if (resources->collNetCommRefCount[netDev] == 0) {
-    NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    if (resources->sendMhandles[p]) {
+      NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p]));
+    }
   }
-  for (int c=0; c<MAXCHANNELS; c++) if (resources->collNetCommRefCount[c]) return ncclSuccess;
-  comm->proxyState.sharedBuffs.collNetResources = NULL;
-  free(resources);
+  struct connectMapMem* mems = resources->map.mems;
+  NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+  CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+  if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+  NCCLCHECK(sharedBuffersDestroy(comm));
+  NCCLCHECK(sharedFree(comm, resources->netDev));
+  free(connection->transportResources);
   return ncclSuccess;
 }
 
-ncclResult_t collNetSendFree(void* sendTransportResources) {
-  struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
-  if (resources->collNetComm) {
-    NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_LL]));
-    NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    if (resources->mhandles[p]) {
+      NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p]));
+    }
   }
-  if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem));
-
-  NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
-  free(resources);
+  struct connectMapMem* mems = resources->map.mems;
+  NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+  CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+  if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+  NCCLCHECK(sharedBuffersDestroy(comm));
+  NCCLCHECK(sharedFree(comm, resources->netDev));
+  free(connection->transportResources);
   return ncclSuccess;
 }
 
-ncclResult_t collNetRecvFree(void* recvTransportResources) {
-  struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
-  if (resources->collNetComm) {
-    NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_LL]));
-    NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
-  }
-  if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem));
-
-  NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
-  free(resources);
-  return ncclSuccess;
-}
 
 #define LAST_OF_GROUP(s) \
   (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
 
-ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
-  if (args->protocol == NCCL_PROTO_LL128) {
-    WARN("CollNet does not support LL128");
+static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+  if (args->protocol != NCCL_PROTO_SIMPLE) {
+    WARN("CollNet does not support LL/LL128");
     return ncclInternalError;
   }
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
+      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
       // Round to next multiple of sliceSteps
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->received = sub->transmitted = sub->done = 0;
@@ -325,23 +582,21 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
     int perGroupSteps = NCCL_STEPS / nGroups;
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
+      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
       void* sendMhandle = resources->sendMhandles[p];
       void* recvMhandle = resources->recvMhandles[p];
-      int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
       auto reqFifo = resources->reqFifo;
       if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
         int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-        if (p == NCCL_PROTO_SIMPLE) {
-          char* ptr;
-          int sharedBuffSlot = sub->posted%NCCL_STEPS;
-          NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, resources->useGdr, 0, sharedBuffSlot, 0, &ptr));
-          resources->recvMem->ptrsFifo[buffSlot] = ptr + s*args->chunkSize;
-          __sync_synchronize();
-        }
-        volatile uint64_t* sendHead = &resources->sendMem->head;
+        int sharedBuffSlot = sub->posted%NCCL_STEPS;
+        int offset;
+        NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+        resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
+        __sync_synchronize();
+        volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
         sub->posted += args->sliceSteps;
         *sendHead = sub->base + sub->posted - NCCL_STEPS;
+        if (resources->gdcSync) wc_store_fence(); // Flush out WC write
       }
       // Enforce sync between operations of the same group.
       bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received));
@@ -350,30 +605,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
         int sharedBuffSlot = sub->received%NCCL_STEPS;
         volatile int* sizesFifo = resources->recvMem->sizesFifo;
         volatile uint64_t* recvTail = &resources->recvMem->tail;
-        if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)) || p == NCCL_PROTO_LL)) {
+        char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
+        if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) {
           // We have something to receive, let's check whether data is ready.
-          int size = sizesFifo[buffSlot];
           int ready = 1;
           if (s == 0) {
-            NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 0, sharedBuffSlot, 0, &args->sharedBuff[sharedBuffSlot]));
-            args->sharedSize[sharedBuffSlot] = p == NCCL_PROTO_SIMPLE ? args->chunkSize : size/2;
-          }
-          if (p == NCCL_PROTO_LL) {
-            char* localBuff = sub->connector->conn.buffs[p];
-            uint32_t flag = NCCL_LL_FLAG(sub->base + sub->received + 1);
-            int nFifoLines = size / sizeof(union ncclLLFifoLine);
-            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-            // Pack data into the shared buffer
-            uint32_t* sendBuff = (uint32_t*)(args->sharedBuff[sharedBuffSlot]+args->sharedSize[sharedBuffSlot]*s);
-            for (int i=0; i<nFifoLines; i++) {
-              volatile uint32_t *f1 = &lines[i].flag1;
-              volatile uint32_t *d1 = &lines[i].data1;
-              volatile uint32_t *f2 = &lines[i].flag2;
-              volatile uint32_t *d2 = &lines[i].data2;
-              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
-              sendBuff[2*i] = d1[0];
-              sendBuff[2*i+1] = d2[0];
-            }
+            int offset;
+            NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+            args->sharedBuff[sharedBuffSlot] = localBuff + offset;
+            args->sharedSize[sharedBuffSlot] = args->chunkSize;
           }
           if (ready) {
             sizesFifo[buffSlot] = -1;
@@ -437,15 +677,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
   return ncclSuccess;
 }
 
-ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
-  if (args->protocol == NCCL_PROTO_LL128) {
-    WARN("CollNet does not support LL128");
+static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+  if (args->protocol != NCCL_PROTO_SIMPLE) {
+    WARN("CollNet does not support LL/LL128");
     return ncclInternalError;
   }
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
+      struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
       // Round to next multiple of sliceSteps
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
@@ -460,19 +700,20 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
     int perGroupSteps = NCCL_STEPS / nGroups;
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
+      struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
       void* mhandle = resources->mhandles[p];
-      int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
       auto reqFifo = resources->reqFifo;
+      char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+
       // Enforce sync between operations of the same group.
       if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) {
         int group = s / COLLNET_GROUP_NSUBS;
         int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-        char* ptr;
         int sharedBuffSlot = sub->posted%NCCL_STEPS;
         int startChannel = group*COLLNET_GROUP_NSUBS;
-        NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &ptr));
-        reqFifo[group][buffSlot].recvBuff = ptr;
+        int offset;
+        NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+        reqFifo[group][buffSlot].recvBuff = localBuff + offset;
         TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
         sub->posted += args->sliceSteps;
         args->idle = 0;
@@ -487,11 +728,24 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
           int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1);
           TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
           sub->received += args->sliceSteps;
-          if (reqFifo[group][buffSlot].size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
-            int startChannel = group*COLLNET_GROUP_NSUBS;
-            char* groupRecvAddress;
-            NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, 1, 1, sharedBuffSlot, startChannel, &groupRecvAddress));
-            NCCLCHECK(collNetIflush(resources->collNetComm, groupRecvAddress, totalSize, mhandle, sub->requests+buffSlot));
+          sub->requests[buffSlot] = NULL;
+          if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) {
+            // GDRCOPY support
+            if (resources->gdcFlush) {
+#if defined (__x86_64__)
+              // Force a PCI-E read from GPU memory
+              asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax");
+#else
+              WARN("NET: GDR Flush only supported on x86_64");
+              return ncclInternalError;
+#endif
+              sub->requests[buffSlot] = NULL;
+            } else {
+              int startChannel = group*COLLNET_GROUP_NSUBS;
+              int offset;
+              NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+              NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
+            }
           } else {
             for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
           }
@@ -517,27 +771,14 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
         int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
         int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
         int startChannel = group*COLLNET_GROUP_NSUBS;
-        char* groupRecvAddress;
-        NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &groupRecvAddress));
-        char* ptr = groupRecvAddress + (s%COLLNET_GROUP_NSUBS)*args->sharedSize[sharedBuffSlot];
-        if (p == NCCL_PROTO_SIMPLE) {
-          volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
-          ptrsFifo[buffSlot] = ptr;
-          __sync_synchronize();
-          resources->recvMem->tail = sub->base + sub->flushed;
-        }
-        if (p == NCCL_PROTO_LL) { // ll
-          // re-attach flag
-          char* localBuff = sub->connector->conn.buffs[p];
-          uint32_t flag = NCCL_LL_FLAG(sub->base + sub->transmitted + 1);
-          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
-          uint32_t* recvData = (uint32_t*)ptr;
-          int nFifoLines = DIVUP(args->sharedSize[sharedBuffSlot], 2*sizeof(uint32_t));
-          for (int i=0; i<nFifoLines; i++) {
-            lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
-            lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
-          }
-        }
+        int offset;
+        NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+        volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
+        offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize;
+        __sync_synchronize();
+        volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
+        *recvTail = sub->base + sub->flushed;
+        if (resources->gdcSync) wc_store_fence(); // Flush out WC write
         sub->transmitted += args->sliceSteps;
         args->idle = 0;
         continue;
@@ -562,7 +803,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
 
 struct ncclTransport collNetTransport = {
   "COL",
-  collNetCanConnect,
-  { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy },
-  { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy }
+  canConnect,
+  { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
+  { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
 };
diff --git a/src/transport/net.cc b/src/transport/net.cc
index be459840ca..b7b8b753b4 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,62 +9,135 @@
 #include "net.h"
 #include "graph.h"
 #include <sys/time.h>
+#include "proxy.h"
 #include "collectives.h"
-#include <hsa/hsa_ext_amd.h>
 #include "gdrwrap.h"
+#include "shm.h"
+#include "profiler.h"
 
-struct netConnectInfo {
-  ncclNetHandle_t netHandle;
+static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
+
+#define NCCL_NET_MAP_HOSTMEM 0 // bank index: dedicated host memory
+#define NCCL_NET_MAP_DEVMEM 1 // bank index: dedicated device memory
+#define NCCL_NET_MAP_SHARED_HOSTMEM 2 // bank index: shared host memory
+#define NCCL_NET_MAP_SHARED_DEVMEM 3 // bank index: shared device memory
+#define NCCL_NET_MAP_GDCMEM 4 // bank index: GDC memory; 4 cannot be encoded in an offset's 2 bank bits, so it is read from mems[] directly
+#define NCCL_NET_MAP_MEMS 5 // number of banks (size of connectMap::mems)
+
+#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 // bit 30: entry lives in a device-memory bank
+#define NCCL_NET_MAP_MASK_SHARED 0x80000000 // bit 31: entry lives in a shared bank
+#define NCCL_NET_MAP_MASK_USED   0x20000000 // bit 29: entry has been added (distinguishes it from a NULL entry)
+#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff // bits 0-28: offset of the entry inside its bank
+
+#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \
+  ((mapStruct)->offsets.offsetName >> 30) // two top bits (SHARED, DEVMEM): value 0..3, used as index into mems[]
+
+#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \
+  (((mapStruct)->offsets.offsetName >> 29) == 0) // true when none of the SHARED/DEVMEM/USED bits is set
+// Address of an entry: mems[bank].cpuPtr or .gpuPtr (cpuOrGpu is the token `cpu` or `gpu`) plus the low 29 offset bits; NULL when the entry is unset.
+#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \
+  (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \
+   (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
+
+#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \
+  (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) // true when the DEVMEM bit (bit 30) is set
+
+#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { /* statement-like macro: the caller supplies the ';' */ \
+    uint32_t bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; /* flag bits only; unsigned because SHARED is bit 31 */ \
+    if ((shared) == 0) { /* dedicated entry: append it to the private host or device bank */ \
+      if (dev) { \
+        (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \
+        (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += (memSize); \
+      } else { \
+        (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \
+        (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += (memSize); \
+      } \
+    } else { /* shared entry: offset 0 in the shared bank; no bank size is changed here */ \
+      (mapStruct)->offsets.offsetName = bank; \
+    } \
+} while (0)
+
+struct connectMapMem{ // One memory bank of a connectMap (an entry of connectMap::mems)
+  char* gpuPtr; // base used by NCCL_NET_MAP_GET_POINTER(map, gpu, ...); sendConnect fills it from hipIpcOpenMemHandle for device banks
+  char* cpuPtr; // base used by NCCL_NET_MAP_GET_POINTER(map, cpu, ...); sendConnect sets it to NULL for IPC-opened device banks
+  int size; // running total grown by NCCL_NET_MAP_ADD_POINTER; also the size passed to ncclShmOpen (presumably bytes)
+  union { // shmPath is passed to ncclShmOpen/ncclShmUnlink; ipc is passed to hipIpcOpenMemHandle for DEVMEM / SHARED_DEVMEM
+    char shmPath[PATH_MAX];
+    hipIpcMemHandle_t ipc;
+  };
 };
 
-#define LOC_HOSTMEM 0
-#define LOC_DEVMEM  1
-#define LOC_COUNT   2
+struct connectMap {
+  int sameProcess; // non-zero: sendConnect only enables peer access if needed; zero: it maps the shm segment and opens the IPC handles
+  int shared; // NOTE(review): presumably mirrors setupReq.shared; in the code visible here it is only printed by netDumpMap
+  int cudaDev; // device compared with comm->cudaDev and passed to hipDeviceEnablePeerAccess in sendConnect
+  // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem.
+  struct connectMapMem mems[NCCL_NET_MAP_MEMS];
+  // Offsets. The 3 MSBs are the SHARED/DEVMEM/USED flags (the top two select the mem bank); 000 indicates NULL (see NCCL_NET_MAP_OFFSET_NULL).
+  struct {
+    uint32_t sendMem;
+    uint32_t recvMem;
+    uint32_t buffs[NCCL_NUM_PROTOCOLS];
+  } offsets;
+};
 
-struct netSendResources {
+struct sendResources {
+  struct connectMap map;
   void* netSendComm;
   struct ncclSendMem* sendMem;
   struct ncclRecvMem* recvMem;
+
+  int rank;
+  int localRank;
+  int remoteRank;
   int netDev;
   int useGdr;
+  int maxRecvs;
+  uint64_t* gdcSync;
+  void* gdrDesc;
   int shared;
-  char* buffers[LOC_COUNT];
-  int buffSizes[LOC_COUNT];
-  void* mhandles[LOC_COUNT];
-  void** mhandlesProto[NCCL_NUM_PROTOCOLS];
+  int channelId;
+  int connIndex;
+  char* buffers[NCCL_NUM_PROTOCOLS];
+  int buffSizes[NCCL_NUM_PROTOCOLS];
+  void* mhandles[NCCL_NUM_PROTOCOLS];
   uint64_t step;
   uint64_t llLastCleaning;
   uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };
 
-struct netRecvResources {
+struct recvResources {
+  struct connectMap map;
   void* netListenComm;
   void* netRecvComm;
   struct ncclSendMem* sendMem;
   struct ncclRecvMem* recvMem;
 
-  // GDRCOPY support
-  void* gdrMemDesc;
-  struct ncclRecvMem* devRecvMem;
-  void* gdrFlushDesc;
-  int* devFlushMem;
-
+  int rank;
+  int localRank;
+  int remoteRank;
+  int proxyRank;
   int netDev;
   int useGdr;
+  int maxRecvs;
+  uint64_t* gdcSync;
+  uint64_t* gdcFlush;
+  void* gdrDesc;
   int shared;
-  char* buffers[LOC_COUNT];
-  int buffSizes[LOC_COUNT];
-  void* mhandles[LOC_COUNT];
-  void** mhandlesProto[NCCL_NUM_PROTOCOLS];
+  int channelId;
+  int connIndex;
+  char* buffers[NCCL_NUM_PROTOCOLS];
+  int buffSizes[NCCL_NUM_PROTOCOLS];
+  void* mhandles[NCCL_NUM_PROTOCOLS];
   uint64_t step;
   uint64_t llLastCleaning;
   uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };
 
-NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2);
+NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
 
 /* Determine if two peers can communicate with NET */
-ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   // Same host?
   if (info1->hostHash == info2->hostHash) {
     // User disabled NET for intra-node?
@@ -78,259 +151,641 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 }
 
 NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
+NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
+
+struct setupReq { // Payload sent with ncclProxyMsgSetup by sendSetup / recvSetup
+  int rank; // myInfo->rank of the caller
+  int localRank; // filled by ncclTopoGetLocalRank(comm->topo, myInfo->rank, ...)
+  int remoteRank; // peerInfo->rank
+  int shared; // 0 when a graph is given; otherwise NET_SHARED_BUFFERS if set (!= -2), else 1
+  int netDev; // starts at -1; chosen by ncclTopoGetIntraNetDev (P2P_NET connIndex) or ncclTopoGetNetDev
+  int useGdr; // result of ncclTopoCheckGdr for netDev
+  int channelId; // channelId argument of the setup call
+  int connIndex; // connIndex argument of the setup call
+};
 
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
-ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
-  struct netSendResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  send->transportResources = resources;
-  send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
-  send->proxyAppendPtr = send->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId+1 : &send->proxyAppend;
+static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
+  struct setupReq req;
 
-  resources->netDev = -1;
-  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &resources->netDev));
-  if (resources->netDev < 0) {
-    // Send/Receive: Round-robin NICs based on the receiver's CUDA device
-    int nicRR = comm->peerInfo[peerInfo->rank].cudaDev;
-    NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
+  send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+  req.channelId = channelId;
+  req.connIndex = connIndex;
+  req.netDev = -1;
+
+  int proxyRank = myInfo->rank;
+  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &req.netDev));
+  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
+  send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
+  req.rank = myInfo->rank;
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
+  req.remoteRank = peerInfo->rank;
+  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+
+  if (proxyRank == myInfo->rank) {
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+        req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
+  } else {
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+        proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   }
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
-
-  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
-  NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
-
-  send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-  send->conn.tail = &resources->recvMem->tail;
-  send->conn.sizesFifo = resources->recvMem->sizesFifo;
-  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
-  send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
-  send->conn.head = &resources->sendMem->head;
-  resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
-
-  if (resources->shared == 0) {
-    int protoLoc[NCCL_NUM_PROTOCOLS];
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
-    }
-    int buffSizes[NCCL_NUM_PROTOCOLS];
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      buffSizes[p] = send->comm->buffSizes[p];
-      resources->buffSizes[protoLoc[p]] += buffSizes[p];
-    }
-
-    if (resources->buffSizes[LOC_DEVMEM]) {
-      NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
-    }
-    if (resources->buffSizes[LOC_HOSTMEM]) {
-      NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
-    }
-
-    int offsets[LOC_COUNT];
-    offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
-      send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
-      offsets[protoLoc[p]] += buffSizes[p];
-    }
-  }
-
-  if (resources->useGdr) {
-    CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
-    send->conn.curr_hdp_reg = resources->curr_hdp_reg;
-  }
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, 
-		  peerInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "", 
-		  resources->shared ? "/Shared" : "", comm, comm->nRanks);
+  *((int*)connectInfo) = proxyRank;
   return ncclSuccess;
 }
 
 // GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory
-NCCL_PARAM(GdrCopyTailEnable, "GDRCOPY_TAIL_ENABLE", 1);
+NCCL_PARAM(GdrCopySyncEnable, "GDRCOPY_SYNC_ENABLE", 1);
 // GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers
 NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
 
-ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
-  struct netRecvResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  recv->transportResources = resources;
-  recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
-  recv->proxyAppendPtr = recv->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId : &recv->proxyAppend;
+/* Setup recv connector */
+static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
+  struct setupReq req;
 
-  resources->netDev = -1;
-  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
-  if (resources->netDev < 0) {
-    // Send/Receive: Round-robin NICs based on the receiver's CUDA device
-    int nicRR = comm->cudaDev;
-    NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
-  }
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+  recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+  req.channelId = channelId;
+  req.connIndex = connIndex;
+  req.netDev = -1;
 
-  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
-  NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
+  // Use myInfo->rank as the receiver uses its own NIC
+  int proxyRank = myInfo->rank;
+  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &req.netDev));
+  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
 
-  // GDRCOPY tail support
-  if (ncclGdrCopy != NULL && ncclParamGdrCopyTailEnable() == 1) {
-    struct ncclRecvMem* devCudaPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&resources->devRecvMem, &devCudaPtr, 1, &resources->gdrMemDesc));
-    // The GDR mapped VA doesn't work on the SMs
-    recv->conn.tail = &((struct ncclRecvMem*)devCudaPtr)->tail;
-  } else {
-    recv->conn.tail = &resources->recvMem->tail;
-  }
+  // We don't support PXN on receive yet
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
 
-  // GDRCOPY flush support
-#if defined (__x86_64__)
-  if (ncclGdrCopy != NULL && ncclParamGdrCopyFlushEnable() == 1) {
-    int* cudaPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&resources->devFlushMem, &cudaPtr, 1, &resources->gdrFlushDesc));
-  }
-#endif
-
-  recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
-  recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
-  recv->conn.head = &resources->sendMem->head;
-
-  if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p
-    int protoLoc[NCCL_NUM_PROTOCOLS];
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
-    }
-
-    int buffSizes[NCCL_NUM_PROTOCOLS];
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      buffSizes[p] = recv->comm->buffSizes[p];
-      resources->buffSizes[protoLoc[p]] += buffSizes[p];
-    }
-
-    if (resources->buffSizes[LOC_DEVMEM]) {
-      NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
-    }
-    if (resources->buffSizes[LOC_HOSTMEM]) {
-      NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
-    }
-
-    int offsets[LOC_COUNT];
-    offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
-      recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
-      offsets[protoLoc[p]] += buffSizes[p];
-    }
-  }
-
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, peerInfo->rank, 
-		  peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "", 
-		  resources->shared ? "/Shared" : "", comm, comm->nRanks);
-  struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
-  NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
+  req.rank = myInfo->rank;
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
+  req.remoteRank = peerInfo->rank;
+  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
 
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
+      req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   return ncclSuccess;
 }
 
-ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+static ncclResult_t netMapShm(struct connectMapMem* mem) { // Open the shm segment named by mem->shmPath and fill mem->cpuPtr / mem->gpuPtr
+  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0)); // last argument 0: presumably "open existing, do not create" — confirm against ncclShmOpen
+  NCCLCHECK(ncclShmUnlink(mem->shmPath)); // unlink the path immediately after it has been opened
+  return ncclSuccess;
+}
+static ncclResult_t netCreateShm(struct connectMapMem* mem) { // Create a shm segment of mem->size and store its CPU mapping in mem->cpuPtr (no GPU pointer is requested)
+  mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
+  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1)); // NOTE(review): presumably writes the generated name back into mem->shmPath — confirm against ncclShmOpen
+  return ncclSuccess;
+}
+
+static ncclResult_t netDumpMap(struct connectMap* map) { // Debug helper: print the banks and offsets of `map` to stdout; always returns ncclSuccess
+  printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared);
+  struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
+  printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+  mem = map->mems+NCCL_NET_MAP_DEVMEM;
+  printf("Mem 1: Vid  mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+  mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
+  printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+  mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
+  printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); // NOTE(review): the NCCL_NET_MAP_GDCMEM bank is not printed
+  printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", // per offset: USED flag, 2-bit bank index, 29-bit offset, resolved CPU/GPU addresses
+      map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+      NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET,
+      NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem));
+  printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+      map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+      NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET,
+      NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem));
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { // one line per protocol buffer entry
+    printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p,
+        map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+        NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET,
+        NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]));
+  }
+  printf("End of dump\n");
+  return ncclSuccess;
+}
+
+static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Asks the proxy to connect, maps the memory described by the returned connectMap, then fills in send->conn
   // Setup device pointers
-  struct netSendResources* resources = (struct netSendResources*)send->transportResources;
-  struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
+  struct connectMap* map;
+  NCCLCHECK(ncclCalloc(&map, 1));
+  send->transportResources = map; // kept on the connector; sendFree() reads it back
+  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap))); // request: connectInfo sized as a net handle; response: connectMap written into map
 
-  // Connect to remote peer
-  NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+  if (map->sameProcess) { // same-process case: no SHM/IPC mapping, only peer access when the devices differ
+    if (map->cudaDev != comm->cudaDev) {
+      // Enable P2P access
+      hipError_t err = hipDeviceEnablePeerAccess(map->cudaDev, 0);
+      if (err == hipErrorPeerAccessAlreadyEnabled) {
+        hipGetLastError(); // "already enabled" is not treated as a failure; reset the recorded error
+      } else if (err != hipSuccess) {
+        WARN("failed to peer with device %d: %d %s", map->cudaDev, err, hipGetErrorString(err));
+        return ncclInternalError;
+      }
+    }
+  } else { // cross-process case: map the host bank and open IPC handles for the device banks
+    NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); // presumably attaches the segment created by netCreateShm on the proxy side — confirm
+    if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+      CUDACHECK(hipIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, hipIpcMemLazyEnablePeerAccess));
+      map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL; // no CPU-side pointer is kept for the IPC mapping
+    }
+    if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) {
+      void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank; // slot indexed by the proxy connection's local rank
+      if (*sharedDevMemPtr == NULL) { // open the shared buffer only if this slot has not been filled yet
+        CUDACHECK(hipIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, hipIpcMemLazyEnablePeerAccess));
+      }
+      map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr);
+      map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL;
+    }
+  }
+  //NCCLCHECK(netDumpMap(map));
 
-  if (resources->shared) {
-    // Get shared buffers
-    int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
-    NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
-    resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
-  }
+  struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+  void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; // NULL unless the proxy filled in the GDCMEM bank
+  send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head; // use the GDC mapping for head when present, otherwise sendMem->head
 
-  if (resources->buffSizes[LOC_DEVMEM]) {
-    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
-  }
-  if (resources->buffSizes[LOC_HOSTMEM]) {
-    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
-  }
+  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+  send->conn.tail = &recvMem->tail;
+  send->conn.sizesFifo = recvMem->sizesFifo;
+  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+  send->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
+
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+    send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); // GPU-side address of each protocol buffer
   return ncclSuccess;
 }
 
 /* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
-  // Setup device pointers
-  struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
+static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { // Asks the proxy to finish the receive connection and fills recv->conn from the returned connectMap
+  struct connectMap* map;
+  NCCLCHECK(ncclCalloc(&map, 1));
+  recv->transportResources = map; // NOTE(review): recvFree() does not release this allocation — confirm it is freed elsewhere
+  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap))); // request: one int (recvProxyConnect reads it as proxyRank); response: connectMap written into map
+  //NCCLCHECK(netDumpMap(map));
 
-  // Finish connection establishment from remote peer
-  NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
-  NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
+  struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+  recv->conn.head = &sendMem->head;
 
-  if (resources->shared) {
-    // Get shared buffers
-    int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
-    NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
-    resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
-  }
+  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+  void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; // NULL unless the proxy filled in the GDCMEM bank
+  recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; // use the GDC mapping for tail when present, otherwise recvMem->tail
+  recv->conn.sizesFifo = recvMem->sizesFifo;
+  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+  recv->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
 
-  if (resources->buffSizes[LOC_DEVMEM]) {
-    NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
-  }
-  if (resources->buffSizes[LOC_HOSTMEM]) {
-    NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+    recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); // GPU-side address of each protocol buffer
+  return ncclSuccess;
+}
+
+static ncclResult_t sendFree(struct ncclConnector* send) { // Undoes the cross-process mappings recorded in the connector's connectMap
+  struct connectMap* map = (struct connectMap*)(send->transportResources); // NOTE(review): dereferenced without a NULL check, and the map itself (ncclCalloc'd in sendConnect) is not freed here — confirm both are handled by the caller
+  if (map->sameProcess == 0) { // only the cross-process path in sendConnect mapped SHM / opened an IPC handle
+    NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+    if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+      CUDACHECK(hipIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); // the SHARED_DEVMEM handle opened in sendConnect is not closed here
+    }
   }
   return ncclSuccess;
 }
 
-ncclResult_t netSendFree(void* transportResources) {
-  struct netSendResources* resources = (struct netSendResources*)transportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
-  for (int l=0; l<LOC_COUNT; l++) {
-    if (resources->buffers[l])
-      NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
+static ncclResult_t recvFree(struct ncclConnector* recv) { // No-op: nothing is unmapped or released for the receive connector here
+  return ncclSuccess; // NOTE(review): recvConnect stores an ncclCalloc'd connectMap in recv->transportResources — confirm it is freed elsewhere
+}
+
+#define NCCL_SHARED_STEPS 16
+static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess,
+    int nChannels, char** gpuPtr, char** cpuPtr, int* size, hipIpcMemHandle_t* ipc) { // Takes a reference on the shared P2P pool of localRank (type 0 = send, otherwise recv), allocating it on first use; every output pointer is optional
+  if (cuda == 0 && sameProcess == 0) {
+    WARN("PXN should not use host buffers for data");
+    return ncclInternalError;
   }
-  if (resources->shared == 0) {
-    NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
-    CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
+  struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+  if (progressState->localPeers == NULL) { // peer table is created lazily, one entry per local rank
+    NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+  }
+  struct ncclProxyPeer** localPeers = progressState->localPeers;
+  if (localPeers[localRank] == NULL) { // peer entry is created lazily as well
+    NCCLCHECK(ncclCalloc(localPeers+localRank, 1));
+  }
+  struct ncclProxyPeer* peer = localPeers[localRank];
+  struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
+  state->refcount++; // balanced by sharedBuffersDestroy()
+  if (state->size == 0) { // the pool size is computed once and then reused
+    state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
+  }
+
+  if (size) *size = state->size;
+
+  if (cuda && state->cudaBuff == NULL) { // device pool: allocate once, export an IPC handle for cross-process users
+    NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, cuda));
+    if (sameProcess == 0) {
+      CUDACHECK(hipIpcGetMemHandle(&state->ipc, state->cudaBuff));
+    }
+  }
+  if (!cuda && state->hostBuff == NULL) { // host pool: allocate once
+    NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
+  }
+  if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
+  if (sameProcess) {
+    if (gpuPtr) *gpuPtr = cuda ? state->cudaBuff : state->hostBuff; // same value as *cpuPtr, without dereferencing a possibly-NULL cpuPtr
+  } else {
+    if (gpuPtr) *gpuPtr = NULL; // cross-process users get the IPC handle instead of a pointer
+    if (ipc) memcpy(ipc, &state->ipc, sizeof(hipIpcMemHandle_t));
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
+  // Each channel owns NCCL_SHARED_STEPS consecutive slots of the shared pool; 'slot' indexes within that group.
+  const int bytesPerSlot = comm->buffSizes[NCCL_PROTO_SIMPLE] / (NCCL_STEPS * SENDRECV_SLICEFACTOR);
+  const int poolIndex = slot + (channel * NCCL_SHARED_STEPS);
+  *offset = poolIndex * bytesPerSlot;
+  return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) { // Drops one reference on the shared P2P pool of localRank (type 0 = send, otherwise recv) and frees whatever becomes unused
+  if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
+  struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank];
+  if (peer == NULL) NCCLCHECK(ncclInternalError);
+  struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
+  if (state->size == 0) NCCLCHECK(ncclInternalError); // size is only set by sharedBuffersInit(), so 0 means the pool was never initialized
+  state->refcount--;
+  if (state->refcount == 0) { // last user of this direction: release its buffers and clear the pointers so a later init reallocates
+    if (state->cudaBuff) { CUDACHECK(hipFree(state->cudaBuff)); state->cudaBuff = NULL; }
+    if (state->hostBuff) { NCCLCHECK(ncclCudaHostFree(state->hostBuff)); state->hostBuff = NULL; }
+  }
+  if (peer->send.refcount || peer->recv.refcount) return ncclSuccess; // the other direction still uses this peer entry
+  free(peer);
+  comm->proxyState.progressState.localPeers[localRank] = NULL;
+  for (int r=0; r<comm->localRanks; r++) {
+    if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess; // some other peer entry is still alive
+  }
+  // All peers are freed, free array
+  free(comm->proxyState.progressState.localPeers);
+  comm->proxyState.progressState.localPeers = NULL;
+  return ncclSuccess;
+}
+
+static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) { // Takes a reference on the device (cuda=1) send-side (type=0) shared pool for this connection's local rank; no pointers are returned
+  const int peerRank = comm->localRankToRank[connection->localRank];
+  const int inSameProcess = (comm->peerInfo[peerRank].pidHash == comm->peerInfo[comm->rank].pidHash) ? 1 : 0;
+  NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, inSameProcess, nChannels, NULL, NULL, NULL, NULL));
+  return ncclSuccess;
+}
+
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  // The request must be exactly one setupReq; anything else is rejected before any state is created.
+  if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
+  const struct setupReq* setup = (const struct setupReq*) reqBuff;
+  struct sendResources* res;
+  NCCLCHECK(ncclCalloc(&res, 1));
+  connection->transportResources = res;
+  // Mirror the request fields into the proxy-side resources.
+  res->connIndex = setup->connIndex;
+  res->channelId = setup->channelId;
+  res->useGdr = setup->useGdr;
+  res->shared = connection->shared = setup->shared;
+  res->netDev = setup->netDev;
+  res->remoteRank = setup->remoteRank;
+  res->localRank = setup->localRank;
+  res->rank = setup->rank;
+  ncclNetProperties_t netProps;
+  NCCLCHECK(ncclNetGetProperties(setup->netDev, &netProps));
+  res->maxRecvs = netProps.maxRecvs;
+
+  // Send-side setup has no response payload, so a non-empty response buffer is an error.
+  if (respSize != 0) return ncclInternalError;
+  *done = 1;
+  return ncclSuccess;
+}
+
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  // The request must be exactly one setupReq; anything else is rejected before any state is created.
+  if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
+  const struct setupReq* setup = (const struct setupReq*) reqBuff;
+  struct recvResources* res;
+  NCCLCHECK(ncclCalloc(&res, 1));
+  connection->transportResources = res;
+  // Mirror the request fields into the proxy-side resources.
+  res->connIndex = setup->connIndex;
+  res->channelId = setup->channelId;
+  res->useGdr = setup->useGdr;
+  res->shared = connection->shared = setup->shared;
+  res->netDev = setup->netDev;
+  res->remoteRank = setup->remoteRank;
+  res->localRank = setup->localRank;
+  res->rank = setup->rank;
+  ncclNetProperties_t netProps;
+  NCCLCHECK(ncclNetGetProperties(setup->netDev, &netProps));
+  res->maxRecvs = netProps.maxRecvs;
+  // The response buffer receives the listen handle and must be sized as one ncclNetHandle_t.
+  if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
+  NCCLCHECK(ncclNetListen(setup->netDev, respBuff, &res->netListenComm));
+  *done = 1;
+  return ncclSuccess;
+}
+
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { // Proxy-side connect for a send connection: connects the net comm, builds the connectMap, allocates and registers buffers, then copies the map into respBuff
+  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+  if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; // reqBuff is handed to ncclNetConnect as the peer's handle
+
+  if (resources->shared) {
+    // Shared buffers
+    struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+    if (progressState->localPeers == NULL) {
+      NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+    }
+    struct ncclProxyPeer** localPeers = progressState->localPeers;
+    if (localPeers[resources->localRank] == NULL) {
+      NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+    }
+    connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId; // shared: use the per-local-peer send.proxyAppend slot of this channel
+
+    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+      // Connect or reuse connection for a netdev/remote rank.
+      if (progressState->netComms[resources->netDev] == NULL) {
+        NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+      }
+      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
+      if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId)); // connect only when no comm is cached for this netDev/remoteRank/channel
+      resources->netSendComm = comms->sendComm[resources->channelId];
+      if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; // a reference is counted only once the comm exists
+    } else {
+      NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+    }
+  } else {
+    // Connect to remote peer
+    NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+    connection->proxyAppendPtr = &connection->proxyAppend;
+  }
+  if (resources->netSendComm == NULL) { // comm not established yet: report success with *done = 0 (presumably the caller retries — confirm)
+    *done = 0;
+    return ncclSuccess;
+  }
+  *done = 1;
+
+  // Create structures
+  struct connectMap* map = &resources->map;
+  map->sameProcess =
+    comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; // 1 when the requesting rank and this proxy's rank have the same pidHash
+  map->shared = resources->shared;
+  CUDACHECK(hipGetDevice(&map->cudaDev)); // sendConnect compares this with its own device to decide on peer access
+
+  if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]); // 3rd argument presumably selects device memory (non-LL protocols with GDR) — confirm against the macro
+      resources->buffSizes[p] = comm->buffSizes[p];
+    }
+  } else {
+    // Get shared buffers
+    int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+    struct connectMapMem* mapMem = map->mems+bank;
+    NCCLCHECK(sharedBuffersInit(
+          comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels,
+          &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc));
+    resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
+    NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); // only the SIMPLE protocol gets a buffer in shared mode
+  }
+
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+  if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+    if (resources->shared == 0) {
+      if (!map->sameProcess) {
+        ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN); // cross-process only; presumably rounds the size up for IPC export — confirm
+      }
+      NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr));
+      map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
+    }
+    if (!map->sameProcess) {
+      CUDACHECK(hipIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); // handle is opened by sendConnect in the other process
+    }
+  }
+  if (map->sameProcess) {
+    NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+    map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+  } else {
+    NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM)); // cross-process: host bank goes through netCreateShm; sendConnect calls netMapShm on the same bank
+  }
+  if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
+    uint64_t *cpuPtr, *gpuPtr;
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+
+    resources->gdcSync = cpuPtr;
+    struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+    gdcMem->cpuPtr = (char*)cpuPtr;
+    gdcMem->gpuPtr = (char*)gpuPtr;
+    gdcMem->size = sizeof(uint64_t); // sendMem->head
+  }
+
+  resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+  resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+
+  // Don't give credits yet in shared mode.
+  resources->sendMem->head = map->shared ? -NCCL_STEPS : 0;
+  for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->sizesFifo[i] = -1; // -1 marks a slot as empty; the progress loop tests sizesFifo[slot] != -1
+
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
+    if (resources->buffers[p]) { // register every buffer that exists, as CUDA or host memory depending on its bank
+      NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+    }
+  }
+
+  //NCCLCHECK(netDumpMap(map));
+  if (respSize != sizeof(struct connectMap)) return ncclInternalError; // NOTE(review): validated only after all allocations and registrations above
+  memcpy(respBuff, map, sizeof(struct connectMap));
+  return ncclSuccess;
+}
+
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { // Proxy-side connect for a receive connection: accepts the net comm, builds the connectMap, allocates and registers buffers, then copies the map into respBuff
+  if (reqSize != sizeof(int)) return ncclInternalError;
+  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+  resources->proxyRank = *(int*)reqBuff; // the request payload is a single int stored as proxyRank
+
+  // Finish connection establishment from remote peer
+  if (resources->shared) {
+    // Shared buffers
+    struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+    if (progressState->localPeers == NULL) {
+      NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+    }
+    struct ncclProxyPeer** localPeers = progressState->localPeers;
+    if (localPeers[resources->localRank] == NULL) {
+      NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+    }
+    connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId; // shared: use the per-local-peer recv.proxyAppend slot of this channel
+
+    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+      // Connect or reuse connection for a netdev/remote rank.
+      if (progressState->netComms[resources->netDev] == NULL) {
+        NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+      }
+      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; // indexed by proxyRank here (the send side indexes by remoteRank)
+      if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId)); // accept only when no comm is cached for this netDev/proxyRank/channel
+      resources->netRecvComm = comms->recvComm[resources->channelId];
+      if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; // a reference is counted only once the comm exists
+    } else {
+      NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+    }
+  } else {
+    // Connect to remote peer
+    NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+    connection->proxyAppendPtr = &connection->proxyAppend;
+  }
+  if (resources->netRecvComm == NULL) { // comm not established yet: report success with *done = 0 (presumably the caller retries — confirm)
+    *done = 0;
+    return ncclSuccess;
+  }
+  *done = 1;
+  NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); // the listen comm is closed once a receive comm is available
+
+  // Create structures
+  struct connectMap* map = &resources->map;
+  map->sameProcess =
+    comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; // 1 when the requesting rank and this proxy's rank have the same pidHash
+  if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv
+  map->shared = resources->shared;
+
+  if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]); // unlike the send side, LL is not special-cased here
+      resources->buffSizes[p] = comm->buffSizes[p];
+    }
+  } else {
+    // Get shared buffers
+    int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+    struct connectMapMem* mapMem = map->mems+bank;
+    NCCLCHECK(sharedBuffersInit(
+          comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels,
+          &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
+    resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
+    NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); // only the SIMPLE protocol gets a buffer in shared mode
+  }
+
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+  NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+  if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+    if (resources->shared == 0) {
+      NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr));
+      map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
+    }
+  }
+  NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+  map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+  if (ncclGdrCopy && map->sameProcess) {
+    uint64_t *cpuPtr, *gpuPtr;
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); // two elements: [0] is used for gdcSync, [1] for gdcFlush
+
+    if (ncclParamGdrCopySyncEnable()) {
+      resources->gdcSync = cpuPtr;
+      struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+      gdcMem->cpuPtr = (char*)cpuPtr;
+      gdcMem->gpuPtr = (char*)gpuPtr;
+      gdcMem->size = sizeof(uint64_t);
+    }
+    if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; // NOTE(review): recvProxyFree frees gdrDesc only when the GDCMEM bank is set, i.e. only when sync is enabled — confirm the allocation is not leaked otherwise
+  }
+
+  resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+  resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
+    if (resources->buffers[p]) { // register every buffer that exists, as CUDA or host memory depending on its bank
+      NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+    }
+  }
+
+  //NCCLCHECK(netDumpMap(map));
+  if (respSize != sizeof(struct connectMap)) return ncclInternalError; // NOTE(review): validated only after all allocations and registrations above
+  memcpy(respBuff, map, sizeof(struct connectMap));
+  return ncclSuccess;
+}
+
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { // Releases everything sendProxySetup/sendProxyConnect created for this connection
+  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+  if (resources == NULL) { // NVB Preconnect
+    NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0)); // only a shared-pool reference is held in this case (type 0 = send)
+    return ncclSuccess;
+  }
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    if (resources->buffers[p]) { // deregister exactly the buffers that sendProxyConnect registered
+      NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p]));
+    }
+  }
+  struct connectMapMem* mems = resources->map.mems;
+  if (resources->map.sameProcess) { // host bank was allocated with ncclCudaHostCalloc in the same-process case, netCreateShm otherwise
+    NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+  } else {
+    NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size));
+  }
+  CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); // dedicated device bank; sendProxyConnect allocates it only when shared == 0
+  if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+  if (resources->shared) {
+    NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
+    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // same condition sendProxyConnect used to pick a shared net comm
+      struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
+      comms->sendRefCount[resources->channelId]--;
+      if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId])); // NOTE(review): the cached sendComm pointer is not reset to NULL after closing — confirm it cannot be reused by a later connect
+    } else {
+      NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
+    }
+  } else {
+    NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   }
-  NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   free(resources);
   return ncclSuccess;
 }
 
-ncclResult_t netRecvFree(void* transportResources) {
-  struct netRecvResources* resources = (struct netRecvResources*)transportResources;
-  // GDRCOPY support
-  if (resources->gdrFlushDesc) {
-    NCCLCHECK(ncclGdrCudaFree(resources->gdrFlushDesc));
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { // Releases everything recvProxySetup/recvProxyConnect created for this connection
+  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+  if (resources == NULL) { // NVB Preconnect
+    NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1)); // only a shared-pool reference is held in this case (type 1 = recv)
+    return ncclSuccess;
   }
-  // GDRCOPY support
-  if (resources->gdrMemDesc) {
-    NCCLCHECK(ncclGdrCudaFree(resources->gdrMemDesc));
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    if (resources->buffers[p]) { // deregister exactly the buffers that recvProxyConnect registered
+      NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p]));
+    }
   }
-  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
-  for (int l=0; l<LOC_COUNT; l++) {
-    if (resources->buffers[l])
-      NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
+  struct connectMapMem* mems = resources->map.mems;
+  NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); // recv is same-process only, so the host bank always comes from ncclCudaHostCalloc
+  CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); // dedicated device bank; recvProxyConnect allocates it only when shared == 0
+  if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); // NOTE(review): recvProxyConnect can allocate gdrDesc without setting the GDCMEM bank (sync disabled) — confirm that case is not leaked
+  if (resources->shared) {
+    NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
+    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // same condition recvProxyConnect used to pick a shared net comm
+      struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
+      comms->recvRefCount[resources->channelId]--;
+      if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId])); // NOTE(review): the cached recvComm pointer is not reset to NULL after closing — confirm it cannot be reused by a later connect
+    } else {
+      NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
+    }
+  } else {
+    NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
   }
-  if (resources->shared == 0) {
-    NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
-    CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
-  }
-  NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
   free(resources);
   return ncclSuccess;
 }
 
 static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
 
-ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
+static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
+      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
       // Round to next multiple of sliceSteps
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->transmitted = sub->done = 0;
+      for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
     }
     args->state = ncclProxyOpProgress;
     args->hdp_flushed = 0;
@@ -338,29 +793,33 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
   args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
     int p = args->protocol;
+    int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
       if (sub->done == sub->nsteps) continue;
-      struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
-      void* mhandle = *(resources->mhandlesProto[p]);
-      int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
-      char* localBuff = sub->connector->conn.buffs[p];
+      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
+      void* mhandle = resources->mhandles[p];
+      int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+      char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
       int buffSize = stepSize*args->sliceSteps;
-      if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
-      if (sub->sendbytes < buffSize) buffSize = sub->sendbytes;
+      if (sub->nbytes < buffSize) buffSize = sub->nbytes;
       // Post buffers to the GPU
-      if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
+      if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
         int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
         if (resources->shared) {
-          char* ptr;
-          int sharedBuffSlot = sub->posted%NCCL_STEPS;
-          NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 0, sub->channel->id, sharedBuffSlot, s, &ptr));
-          resources->recvMem->ptrsFifo[buffSlot] = ptr;
+          int sharedBuffSlot = sub->posted%maxDepth;
+          int offset;
+          NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
+          resources->recvMem->offsFifo[buffSlot] = offset;
           __sync_synchronize();
-          volatile uint64_t* sendHead = &resources->sendMem->head;
+          volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
           sub->posted += args->sliceSteps;
           *sendHead = sub->base + sub->posted - NCCL_STEPS;
+          if (resources->gdcSync) wc_store_fence(); // Flush out WC write
         } else sub->posted += args->sliceSteps;
+        for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
+          ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait);
+        }
         args->idle = 0;
         continue;
       }
@@ -372,7 +831,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
         if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
           // We have something to receive, let's check if it's completely ready.
           int size = sizesFifo[buffSlot];
-          char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+          char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
           int ready = 1;
           if (p == NCCL_PROTO_LL128) {
             ready = resources->useGdr;
@@ -404,22 +863,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
               STORE(resources->curr_hdp_reg, 1);
             }
             // Data is ready, try to send.
-            NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
+            NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
             if (sub->requests[buffSlot] != NULL) {
-#ifdef ENABLE_PROFILING
-              if (sub->channel->active_req == 0) {
-                gettimeofday(&sub->channel->tvs, NULL);
-                sub->channel->sizes = 0;
-              }
-              sub->channel->active_req ++;
-              sub->channel->sizes += LOAD(sizesFifo+buffSlot);
-              sub->channel->send_byte += LOAD(sizesFifo+buffSlot);
-#endif
-              TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
+              TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
               sizesFifo[buffSlot] = -1;
               // Make sure size is reset to zero before we update the head.
               __sync_synchronize();
               sub->transmitted += args->sliceSteps;
+              for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
               args->idle = 0;
               continue;
             }
@@ -432,29 +883,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
         int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
         NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
         if (done) {
-          TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
-#ifdef ENABLE_PROFILING
-          if (args->protocol == NCCL_PROTO_SIMPLE) {
-            sub->channel->active_req --;
-            if (sub->channel->active_req == 0) {
-              struct timeval tv;
-              gettimeofday(&tv, NULL);
-              float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec;
-              if (delta) {
-#ifdef ENABLE_TIMING_PROFILE
-                sub->channel->bw_cumulative += (float)delta/1E3;
-#else
-                sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3;
-#endif
-                sub->channel->bw_count ++;
-              }
-            }
-          }
-#endif
+          TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
           sub->done += args->sliceSteps;
+          for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
 
           if (resources->shared == 0) {
-            resources->sendMem->head = sub->base + sub->done;
+            volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
+            *sendHead = sub->base + sub->done;
+            if (resources->gdcSync) wc_store_fence(); // Flush out WC write
           }
           args->idle = 0;
           if (sub->done == sub->nsteps) {
@@ -471,140 +907,203 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
   return ncclSuccess;
 }
 
-ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
+static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { // progress callback registered in netTransport's recv slot
   if (args->state == ncclProxyOpReady) {
+    // Initialize subs and group them by same recvComm.
+    void* recvComm; // recvComm of the previous sub; only read when s>0, after it has been assigned below
+    int groupSize = 0; // size of the group currently being built
+    int maxRecvs = 1; // group size limit, refreshed from each sub's resources->maxRecvs
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
+      if (groupSize == maxRecvs) { // current group is full: start a new one with this sub
+        groupSize = 0;
+      } else if (s>0) { // Find next sub with the same recvComm
+        int next;
+        for (next=s; next<args->nsubs; next++) { // scan starts at s itself, so an already adjacent match needs no swap
+          struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources);
+          if (nextRes->netRecvComm == recvComm) break;
+        }
+        if (next == args->nsubs) { // Not found
+          groupSize = 0;
+        } else if (s != next) { // We found a sub later with the same recvComm ; swap subs
+          struct ncclProxySubArgs temp;
+          memcpy(&temp, sub, sizeof(struct ncclProxySubArgs));
+          memcpy(sub, args->subs+next, sizeof(struct ncclProxySubArgs));
+          memcpy(args->subs+next, &temp, sizeof(struct ncclProxySubArgs));
+        }
+      }
+      groupSize++; // this sub joins the (possibly new) group
+      struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+      maxRecvs = resources->maxRecvs;
+      recvComm = resources->netRecvComm; // remembered for the lookup done on the next iteration
       // Round to next multiple of sliceSteps
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->received = sub->transmitted = sub->done = 0;
+      for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize; // group members are contiguous and end at this sub: refresh the size on all of them
+      for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
     }
     args->state = ncclProxyOpProgress;
   }
   args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
     int p = args->protocol;
-    for (int s=0; s<args->nsubs; s++) {
-      struct ncclProxySubArgs* sub = args->subs+s;
-      if (sub->done == sub->nsteps) continue;
-      struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
-      void* mhandle = *(resources->mhandlesProto[p]);
-      int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
-      char* localBuff = sub->connector->conn.buffs[p];
-      int buffSize = stepSize*args->sliceSteps;
-      if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
-      if (sub->recvbytes < buffSize) buffSize = sub->recvbytes;
+    int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
+    for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+      struct ncclProxySubArgs* subGroup = args->subs+s;
+      int subCount = 0;
+      void* ptrs[NCCL_PROXY_MAX_SUBS];
+      int sizes[NCCL_PROXY_MAX_SUBS];
+      int tags[NCCL_PROXY_MAX_SUBS];
+      void* mhandles[NCCL_PROXY_MAX_SUBS];
 
-      if ((sub->posted < sub->done + NCCL_STEPS) && (sub->posted < sub->nsteps)) {
-        int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-        char* ptr;
-        if (resources->shared) {
-          int sharedBuffSlot = sub->posted%NCCL_STEPS;
-          NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 1, sub->channel->id, sharedBuffSlot, s, &ptr));
-          volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
-          ptrsFifo[buffSlot] = ptr;
-        } else {
-          ptr = localBuff+buffSlot*stepSize;
-        }
-        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
-        if (sub->requests[buffSlot] != NULL) {
-          TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
-#ifdef ENABLE_PROFILING
-          if (args->protocol == NCCL_PROTO_SIMPLE) {
-            if (sub->channel->active_req == 0) {
-              gettimeofday(&sub->channel->tvs, NULL);
-              sub->channel->sizes = 0;
-            }
-            sub->channel->active_req ++;
+      for (int i=0; i<subGroup->groupSize; i++) {
+        struct ncclProxySubArgs* sub = subGroup + i;
+        if (sub->posted < sub->nsteps) {
+          if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
+          struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+          int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+          char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+          int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
+          if (resources->shared) {
+            int sharedBuffSlot = sub->posted%maxDepth;
+            int offset;
+            NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
+            volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
+            offsFifo[buffSlot] = offset;
+            ptrs[subCount] = localBuff+offset;
+          } else {
+            ptrs[subCount] = localBuff+buffSlot*stepSize;
           }
-#endif
-          sub->posted += args->sliceSteps;
-          args->idle = 0;
-          continue;
+          sizes[subCount] = stepSize*args->sliceSteps;
+          if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
+          tags[subCount] = resources->remoteRank;
+          mhandles[subCount] = resources->mhandles[p];
+          subCount++;
         }
       }
-      if (sub->posted > sub->received) {
-        int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
-        int done, size;
-        NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, &size));
+      if (subCount) {
+        uint64_t step = subGroup->posted;
+        struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+        void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
+        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
+        if (*requestPtr) {
+          for (int i=0; i<subGroup->groupSize; i++) {
+            struct ncclProxySubArgs* sub = subGroup+i;
+            sub->posted += args->sliceSteps;
+            for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
+          }
+          args->idle = 0;
+        }
+      }
+    }
+    if (args->idle == 0) return ncclSuccess;
+
+    for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+      struct ncclProxySubArgs* subGroup = args->subs+s;
+      if (subGroup->posted > subGroup->received) {
+        uint64_t step = subGroup->received;
+        int done;
+        void* ptrs[NCCL_PROXY_MAX_SUBS];
+        int sizes[NCCL_PROXY_MAX_SUBS];
+        void* mhandles[NCCL_PROXY_MAX_SUBS];
+        for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
+        NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes));
         if (done) {
-          sub->received += args->sliceSteps;
-#ifdef ENABLE_PROFILING
-          if (args->protocol == NCCL_PROTO_SIMPLE) {
-            sub->channel->active_req --;
-            sub->channel->sizes += size;
-            sub->channel->recv_byte += size;
-            if (sub->channel->active_req == 0) {
-              struct timeval tv;
-              gettimeofday(&tv, NULL);
-              float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec;
-              if (delta) {
-#ifdef ENABLE_TIMING_PROFILE
-                sub->channel->bw_cumulative += (float)delta/1E3;
-#else
-                sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3;
-#endif
-                sub->channel->bw_count ++;
-              }
+          int useGdr = 0;
+          int totalSize = 0;
+          for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
+          for (int i=0; i<subGroup->groupSize; i++) {
+            struct ncclProxySubArgs* sub = subGroup + i;
+            sub->received += args->sliceSteps;
+            for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
+            if (step < sub->nsteps) {
+              struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+              if (resources->useGdr) useGdr = 1;
             }
           }
-#endif
-          if (size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
-            // Don't pass data to the GPU yet, flush first.
-
+          subGroup->requests[step%NCCL_STEPS] = NULL;
+          if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && useGdr) {
             // GDRCOPY support
-            if (resources->devFlushMem) {
+            struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+            if (resources->gdcFlush) {
 #if defined (__x86_64__)
               // Force a PCI-E read from GPU memory
-              asm volatile ("mov (%0), %%eax" :: "l"(resources->devFlushMem) : "%eax");
+              asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax");
 #else
               WARN("NET: GDR Flush only supported on x86_64");
               return ncclInternalError;
 #endif
-              sub->requests[buffSlot] = NULL;
             } else {
-              volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
-              char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
-              NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, sub->requests+buffSlot));
+              int subCount = 0; // number of buffers gathered for this flush
+              for (int i=0; i<subGroup->groupSize; i++) {
+                struct ncclProxySubArgs* sub = subGroup + i;
+                if (step < sub->nsteps) { // skip subs that have no such step
+                  struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+                  int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+                  char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+                  int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS; // slot of the slice that just completed: received was already advanced, and posted may be several steps ahead
+                  ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; // same address computation as when the receive was posted
+                  mhandles[subCount] = resources->mhandles[p];
+                  subCount++;
+                }
+              }
+              struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); // the group shares one recvComm, so the first sub's resources are used
+              NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); // flush request reuses the slot that held the receive request
             }
-          } else {
-            sub->requests[buffSlot] = NULL;
           }
           args->idle = 0;
-          continue;
         }
       }
-      if (sub->received > sub->transmitted) {
-        // Progress flush operations
-        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
+    }
+    if (args->idle == 0) return ncclSuccess;
+
+    for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+      struct ncclProxySubArgs* subGroup = args->subs+s;
+      if (subGroup->received > subGroup->transmitted) {
+        uint64_t step = subGroup->transmitted;
         int done = 1;
-        if (sub->requests[buffSlot]) NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
+        void* request = subGroup->requests[step%NCCL_STEPS];
+        if (request) NCCLCHECK(ncclNetTest(request, &done, NULL));
         if (done) {
-          sub->transmitted += args->sliceSteps;
-          __sync_synchronize();
-          if (resources->devRecvMem) {
-            // GDRCOPY support: Write updated tail directly to the device memory
-            resources->devRecvMem->tail = sub->base + sub->transmitted;
-            wc_store_fence(); // Flush out WC write
-          } else {
-            resources->recvMem->tail = sub->base + sub->transmitted;
+          for (int i=0; i<subGroup->groupSize; i++) {
+            struct ncclProxySubArgs* sub = subGroup + i;
+            sub->transmitted += args->sliceSteps;
+            for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
+            if (step < sub->nsteps) {
+              __sync_synchronize();
+              struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+              volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
+              *recvTail = sub->base + sub->transmitted;
+              if (resources->gdcSync) wc_store_fence(); // Flush out WC write
+            }
           }
           args->idle = 0;
-          continue;
         }
       }
-      if (sub->transmitted > sub->done) {
-        volatile uint64_t* sendHead = &resources->sendMem->head;
-        uint64_t done = *sendHead;
-        while (done > sub->base + sub->done &&
-            // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
-            sub->transmitted > sub->done) {
-          sub->done += args->sliceSteps;
-          args->idle = 0;
-          if (sub->done == sub->nsteps) {
-            resources->step = sub->base + sub->nsteps;
-            args->done++;
+    }
+    if (args->idle == 0) return ncclSuccess;
+
+    for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+      struct ncclProxySubArgs* subGroup = args->subs+s;
+      for (int i=0; i<subGroup->groupSize; i++) {
+        struct ncclProxySubArgs* sub = subGroup + i;
+        if (sub->done == sub->nsteps) continue;
+        if (sub->transmitted > sub->done) {
+          struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+          volatile uint64_t* sendHead = &resources->sendMem->head;
+          uint64_t done = *sendHead;
+          while (done > sub->base + sub->done &&
+              // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
+              sub->transmitted > sub->done) {
+            sub->done += args->sliceSteps;
+            for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
+            args->idle = 0;
+            if (sub->done == sub->nsteps) {
+              struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+              resources->step = sub->base + sub->nsteps;
+              args->done++;
+              break;
+            }
           }
         }
       }
@@ -618,7 +1117,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
 
 struct ncclTransport netTransport = {
   "NET",
-  netCanConnect,
-  { netSendSetup, netSendConnect, netSendFree, netSendProxy },
-  { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
+  canConnect,
+  { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
+  { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
 };
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 2825a967e5..f6bd92ce06 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,26 +21,44 @@
 #include <poll.h>
 #include <sys/types.h>
 #include <unistd.h>
+#define ENABLE_TIMER 0
+#include "timer.h"
 
 #include "ibvwrap.h"
 
 #define USE_RDMA_WRITE 1
 #define MAXNAMESIZE 64
 static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
-static union socketAddress ncclIbIfAddr;
+static union ncclSocketAddress ncclIbIfAddr;
+
+struct ncclIbMr {
+  uintptr_t addr;
+  int pages;
+  int refs;
+  ibv_mr *mr;
+};
+
+struct ncclIbMrCache {
+  struct ncclIbMr *slots;
+  int capacity, population;
+};
 
 static int ncclNIbDevs = -1;
-struct ncclIbDev {
+struct alignas(64) ncclIbDev {
+  pthread_mutex_t lock;
   int device;
   uint64_t guid;
   uint8_t port;
   uint8_t link;
   int speed;
   ibv_context* context;
+  int pdRefs;
+  ibv_pd* pd;
   char devName[MAXNAMESIZE];
   char* pciPath;
   int realPort;
   int maxQp;
+  struct ncclIbMrCache mrCache;
 };
 
 #define MAX_IB_PORT 15
@@ -53,6 +71,7 @@ struct userIbDev {
 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
 struct userIbDev userIbDevs[MAX_IB_DEVS];
 pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
+static int ncclIbRelaxedOrderingEnabled = 0;
 
 NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
 NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
@@ -62,6 +81,7 @@ NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
 NCCL_PARAM(IbSl, "IB_SL", 0);
 NCCL_PARAM(IbTc, "IB_TC", 0);
 NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
+NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
 
 pthread_t ncclIbAsyncThread;
 static void* ncclIbAsyncThreadMain(void* args) {
@@ -115,17 +135,28 @@ static int ncclIbSpeed(int speed) {
   return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds)/sizeof(int)-1)];
 }
 
+// Determine whether RELAXED_ORDERING is enabled and possible
+static int ncclIbRelaxedOrderingCapable(void) { // returns 1 when capable, 0 otherwise
+  int roMode = ncclParamIbPciRelaxedOrdering(); // IB_PCI_RELAXED_ORDERING parameter (declared with default 2)
+  ncclResult_t r = ncclInternalError; // stays an error for any mode other than 1 or 2, which yields "not capable"
+  if (roMode == 1 || roMode == 2) {
+    // Query IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+    r = wrap_ibv_reg_mr_iova2(NULL, NULL, NULL, 0, 0, 0); // NOTE(review): presumably an all-NULL call only probes for the symbol and returns ncclInternalError when it is missing - confirm in ibvwrap
+  }
+  return r == ncclInternalError ? 0 : 1; // any result other than ncclInternalError counts as capable
+}
+
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
+  if (ncclParamIbDisable()) return ncclInternalError;
   static int shownIbHcaEnv = 0;
   if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
-  if (ncclParamIbDisable()) return ncclInternalError;
 
   if (ncclNIbDevs == -1) {
     pthread_mutex_lock(&ncclIbLock);
     wrap_ibv_fork_init();
     if (ncclNIbDevs == -1) {
       ncclNIbDevs = 0;
-      if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
+      if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
         WARN("NET/IB : No IP interface found.");
         return ncclInternalError;
       }
@@ -176,18 +207,27 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
           }
           TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
               portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+          pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
           ncclIbDevs[ncclNIbDevs].device = d;
           ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
           ncclIbDevs[ncclNIbDevs].port = port;
           ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
           ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
           ncclIbDevs[ncclNIbDevs].context = context;
+          ncclIbDevs[ncclNIbDevs].pdRefs = 0;
+          ncclIbDevs[ncclNIbDevs].pd = NULL;
           strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
           NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
           ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
+          ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
+          ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
+          ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
+
+          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); // one async-event thread per detected port
+          // Detached exactly once, in the [RCCL] block below (never pthread_join()'d); detaching twice is not allowed
+          ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); // named with the device index before it is incremented
           ncclNIbDevs++;
           nPorts++;
-          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
           // [RCCL]
           pthread_detach(ncclIbAsyncThread);
           // [/RCCL]
@@ -201,13 +241,16 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
     } else {
       char line[1024];
       line[0] = '\0';
+      // Determine whether RELAXED_ORDERING is enabled and possible
+      ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable();
       for (int d=0; d<ncclNIbDevs; d++) {
         snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
             ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
       }
       line[1023] = '\0';
       char addrline[SOCKET_NAME_MAXLEN+1];
-      INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr, addrline));
+      INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
+           ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
     }
     pthread_mutex_unlock(&ncclIbLock);
   }
@@ -239,11 +282,13 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
   return ncclSuccess;
 }
 
-static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
   memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
   return ncclSuccess;
 }
 
+#define NCCL_NET_IB_MAX_RECVS 8
+
 ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
   props->name = ncclIbDevs[dev].devName;
   props->pciPath = ncclIbDevs[dev].pciPath;
@@ -255,18 +300,23 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
     props->ptrSupport |= NCCL_PTR_CUDA;
   }
   props->speed = ncclIbDevs[dev].speed;
+  props->latency = 0; // Not set
   props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
   props->maxComms = ncclIbDevs[dev].maxQp;
+  props->maxRecvs = NCCL_NET_IB_MAX_RECVS;
   return ncclSuccess;
 }
 
-#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
+// We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive
+#define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS)
+static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion");
 
 #define NCCL_IB_MAX_QPS 128
 
 struct ncclIbQpInfo {
   uint32_t lid;
   uint8_t ib_port;
+  uint8_t link_layer;
   uint32_t qpn[NCCL_IB_MAX_QPS];
 
   // For RoCE
@@ -279,46 +329,83 @@ struct ncclIbQpInfo {
   uint64_t fifoAddr;
 };
 
-struct ncclIbHandle {
-  union socketAddress connectAddr;
+enum ncclIbCommState {
+  ncclIbCommStateStart = 0, // value a handle has after ncclIbListen zeroes it with memset
+  ncclIbCommStateConnect = 1, // ncclIbConnect: socket connect issued, completion still to be checked
+  ncclIbCommStateAccept = 3, // NOTE(review): value 2 is skipped; presumably used on the accept side - confirm
+  ncclIbCommStateSend = 4, // ncclIbConnect: QP info copied into the stage buffer and being sent
+  ncclIbCommStateRecv = 5, // NOTE(review): presumably the accept side receiving the peer's QP info - confirm
+  ncclIbCommStateConnected = 6, // ncclIbConnect: QP info fully sent, comm returned to the caller
 };
 
+struct ncclIbCommStage {
+  enum ncclIbCommState state; // where the next call should resume
+  int offset; // progress cursor given to ncclSocketProgress; ncclIbConnect compares it with sizeof(qpInfo)
+  void* buffer; // heap copy of the message in flight (qpInfo in ncclIbConnect), freed once fully sent
+  void* comm; // comm under construction, kept across repeated calls
+};
+
+struct ncclIbHandle {
+  union ncclSocketAddress connectAddr; // Filled by the target
+  struct ncclIbCommStage stage; // Used by the other side when connecting
+};
+
+#define NCCL_NET_IB_REQ_UNUSED 0
+#define NCCL_NET_IB_REQ_SEND 1
+#define NCCL_NET_IB_REQ_RECV 2
+#define NCCL_NET_IB_REQ_FLUSH 3
+
 struct ncclIbRequest {
-  int used;
-  int type;
   struct ncclIbVerbs* verbs;
+  int type;
   int events;
-  int size;
-  union socketAddress *addr;
+  union ncclSocketAddress *addr;
+  int nreqs;
+  union {
+    struct {
+      int size;
+      void* data;
+      uint32_t lkey;
+      int offset;
+    } send;
+    struct {
+      int sizes[NCCL_NET_IB_MAX_RECVS];
+    } recv;
+  };
 };
 
 struct ncclIbVerbs {
-  struct ibv_pd* pd;
+  int dev;
+  struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd
   struct ibv_cq* cq;
-  uint64_t pad[2];
+  uint64_t pad[1];
   struct ncclIbRequest reqs[MAX_REQUESTS];
 };
 
 struct ncclIbListenComm {
   int dev;
-  int fd;
+  struct ncclSocket sock;
+  struct ncclIbCommStage stage;
 };
 
 struct alignas(64) ncclIbSendFifo {
   uint64_t addr;
   int      size;
-  uint32_t seq;
   uint32_t rkey;
-  uint32_t ready;
-  uint64_t pad[1]; // Pad FIFO element size to be 32-bytes
+  uint32_t nreqs;
+  uint32_t tag;
+  uint64_t idx;
 };
 
 struct ncclIbSendComm {
   struct ncclIbVerbs verbs;
-  struct ncclIbSendFifo fifo[MAX_REQUESTS];
-  uint32_t fifoHead;
-  int fd;
-  union socketAddress addr;
+  struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  uint64_t fifoHead;
+  struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
+  struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
+  struct ncclSocket sock;
+
   int ready;
   struct ibv_qp* qps[NCCL_IB_MAX_QPS];
   int nqps;
@@ -339,10 +426,10 @@ struct ncclIbGpuFlush {
 };
 
 struct ncclIbRemFifo {
-  struct ncclIbSendFifo elems[MAX_REQUESTS];
+  struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  uint64_t fifoTail;
   uint64_t addr;
   uint32_t rkey;
-  uint32_t tail;
   uint32_t flags;
   struct ibv_mr* mr;
   struct ibv_sge sge;
@@ -351,8 +438,7 @@ struct ncclIbRemFifo {
 struct ncclIbRecvComm {
   struct ncclIbVerbs verbs;
   struct ncclIbRemFifo remFifo;
-  int fd;
-  union socketAddress addr;
+  struct ncclSocket sock;
   int ready;
   struct ibv_qp* qps[NCCL_IB_MAX_QPS];
   int nqps;
@@ -362,17 +448,39 @@ static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendC
 
 NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
 
-ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
-  NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
+ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) { // attach verbs to the per-device shared PD and create its own CQ
+  verbs->dev = dev; // remembered so ncclIbDestroyVerbs can find the device entry
+
+  pthread_mutex_lock(&ncclIbDevs[dev].lock); // protects pdRefs and pd of this device
+  if (0 == ncclIbDevs[dev].pdRefs++) { // first user of the device allocates the PD
+    ncclResult_t res;
+    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure);
+    if (0) { // only entered through the goto above
+    failure: ncclIbDevs[dev].pdRefs--; // allocation failed: give back the reference so the next caller retries instead of using an unset PD
+      pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+      return res;
+    }
+  }
+  verbs->pd = ncclIbDevs[dev].pd; // local copy of the shared PD
+  pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+
   // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
   NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
   return ncclSuccess;
 }
 
 ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) {
+  ncclResult_t res; // result returned after the device lock is released
   NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq));
-  NCCLCHECK(wrap_ibv_dealloc_pd(verbs->pd));
-  return ncclSuccess;
+
+  pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); // NOTE(review): if the CQ destroy above fails, NCCLCHECK presumably returns first and this PD reference is never dropped - confirm
+  if (0 == --ncclIbDevs[verbs->dev].pdRefs) { // last user of the device releases the shared PD
+    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning);
+  }
+  res = ncclSuccess;
+returning:
+  pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); // reached on both success and dealloc failure
+  return res;
 }
 
 ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
@@ -398,7 +506,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
+ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
   struct ibv_qp_attr qpAttr;
   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
   qpAttr.qp_state = IBV_QPS_RTR;
@@ -407,7 +515,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
   qpAttr.rq_psn = 0;
   qpAttr.max_dest_rd_atomic = 1;
   qpAttr.min_rnr_timer = 12;
-  if (info->lid == 0) {
+  if (info->link_layer == IBV_LINK_LAYER_ETHERNET) {
     qpAttr.ah_attr.is_global = 1;
     qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
     qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid;
@@ -426,7 +534,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
+ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
   struct ibv_qp_attr qpAttr;
   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
   qpAttr.qp_state = IBV_QPS_RTS;
@@ -439,33 +547,56 @@ ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
   return ncclSuccess;
 }
 
-
 ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
   struct ncclIbListenComm* comm;
   NCCLCHECK(ncclCalloc(&comm, 1));
   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
   static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
+  memset(handle, 0, sizeof(struct ncclIbHandle));
   comm->dev = dev;
-  NCCLCHECK(GetSocketAddr(&(handle->connectAddr)));
-  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+  NCCLCHECK(GetSocketAddr(&comm->sock.addr));
+  NCCLCHECK(ncclSocketListen(&comm->sock));
+  memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
   *listenComm = comm;
   return ncclSuccess;
 }
 
 ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
-  struct ncclIbSendComm* comm;
-  NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
-
   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
-  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
-  *sendComm = comm;
+  enum ncclSocketState conState;
+  struct ncclIbCommStage* stage = &handle->stage;
+  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
+  *sendComm = NULL;
 
-  comm->addr = handle->connectAddr;
+  if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
+  if (stage->state == ncclIbCommStateSend) goto ib_send;
+  if (stage->state != ncclIbCommStateStart) {
+    WARN("Error: trying to connect already connected sendComm");
+    return ncclInternalError;
+  }
+
+  NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
+  NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1));
+  stage->comm = comm;
+  stage->state = ncclIbCommStateConnect;
+  NCCLCHECK(ncclSocketConnect(&comm->sock));
+
+ib_connect_check:
+  /* since ncclSocketConnect is async, we must check if connection is complete */
+  NCCLCHECK(ncclGetSocketState(&comm->sock, &conState));
+  if (conState == ncclSocketConnecting) {
+    /* expect user to call again */
+    return ncclSuccess;
+  } else if (conState == ncclSocketError) {
+    return ncclSystemError;
+  }
 
   // IB Setup
-  ibv_context* ctx = ncclIbDevs[dev].context;
-  NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
-  uint8_t ib_port = ncclIbDevs[dev].port;
+  struct ibv_context* ctx;
+  ctx = ncclIbDevs[dev].context;
+  NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs));
+  uint8_t ib_port;
+  ib_port = ncclIbDevs[dev].port;
   comm->nqps = ncclParamIbQpsPerConn();
   for (int q=0; q<comm->nqps; q++) {
     NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
@@ -480,13 +611,14 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   qpInfo.mtu = portAttr.active_mtu;
 
   // Prepare my fifo
-  NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+  NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
   qpInfo.fifoRkey = comm->fifoMr->rkey;
   qpInfo.fifoAddr = (uint64_t)comm->fifo;
 
   // RoCE support
   qpInfo.lid = portAttr.lid;
-  if (qpInfo.lid) { // IB
+  qpInfo.link_layer = portAttr.link_layer;
+  if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
     for (int q=0; q<comm->nqps; q++)
       INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
   } else { // RoCE
@@ -498,7 +630,19 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
       INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
   }
 
-  NCCLCHECK(socketSend(comm->fd, &comm->addr, &qpInfo, sizeof(qpInfo)));
+  stage->state = ncclIbCommStateSend;
+  stage->offset = 0;
+  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo)));
+  memcpy(stage->buffer, &qpInfo, sizeof(qpInfo));
+
+ib_send:
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
+  if (stage->offset != sizeof(qpInfo))
+    return ncclSuccess;
+
+  free(stage->buffer);
+  stage->state = ncclIbCommStateConnected;
+  *sendComm = comm;
   return ncclSuccess;
 }
 
@@ -506,24 +650,53 @@ NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
 
 ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
-  struct ncclIbRecvComm* rComm;
-  NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
+  struct ncclIbCommStage* stage = &lComm->stage;
+  struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
+  *recvComm = NULL;
+
+  if (stage->state == ncclIbCommStateAccept) goto ib_accept;
+  if (stage->state == ncclIbCommStateRecv) goto ib_recv;
+  if (stage->state == ncclIbCommStateSend) goto ib_send;
+  if (stage->state != ncclIbCommStateStart) {
+    WARN("Listencomm in unknown state %d\n", stage->state);
+    return ncclInternalError;
+  }
+
+  NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
+  stage->comm = rComm;
+  stage->state = ncclIbCommStateAccept;
+  lComm->sock.asyncFlag = 1;
+  rComm->sock.asyncFlag = 1;
+
+ib_accept:
+  NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
+  if (rComm->sock.fd == -1)
+    return ncclSuccess;
 
-  socklen_t socklen = sizeof(union socketAddress);
-  SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", rComm->fd);
   struct ncclIbQpInfo remQpInfo;
-  NCCLCHECK(socketRecv(rComm->fd, &rComm->addr, &remQpInfo, sizeof(remQpInfo)));
+  stage->state = ncclIbCommStateRecv;
+  stage->offset = 0;
+  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
+ib_recv:
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
+  if (stage->offset != sizeof(remQpInfo))
+    return ncclSuccess;
+
+  /* copy back the received info */
+  memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
 
   // IB setup
-  ibv_context* ctx = ncclIbDevs[lComm->dev].context;
-  uint8_t ib_port = ncclIbDevs[lComm->dev].port;
+  struct ibv_context* ctx;
+  uint8_t ib_port;
+  ctx = ncclIbDevs[lComm->dev].context;
+  ib_port = ncclIbDevs[lComm->dev].port;
   struct ibv_port_attr portAttr;
   NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
   union ibv_gid gid;
   NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
 
   // QP Creation
-  NCCLCHECK(ncclIbInitVerbs(ctx, &rComm->verbs));
+  NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs));
   rComm->nqps = ncclParamIbQpsPerConn();
   for (int q=0; q<rComm->nqps; q++) {
     NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q));
@@ -542,8 +715,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   // Retain remote fifo info and prepare my RDMA ops
   rComm->remFifo.rkey = remQpInfo.fifoRkey;
   rComm->remFifo.addr = remQpInfo.fifoAddr;
-  NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
-  rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
+  NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
   rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
   if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
 
@@ -557,6 +729,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
     NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp));
     struct ncclIbQpInfo localQpInfo;
     localQpInfo.lid=portAttr.lid;
+    localQpInfo.link_layer=portAttr.link_layer;
     localQpInfo.ib_port=ib_port;
     localQpInfo.spn=gid.global.subnet_prefix;
     localQpInfo.iid=gid.global.interface_id;
@@ -568,26 +741,39 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   // Fill Handle
   struct ncclIbQpInfo qpInfo;
   qpInfo.lid=portAttr.lid;
+  qpInfo.link_layer=portAttr.link_layer;
   qpInfo.ib_port=ib_port;
   for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num;
   qpInfo.spn=gid.global.subnet_prefix;
   qpInfo.iid=gid.global.interface_id;
   qpInfo.mtu=remQpInfo.mtu;
 
-  NCCLCHECK(socketSend(rComm->fd, &rComm->addr, &qpInfo, sizeof(qpInfo)));
+  stage->state = ncclIbCommStateSend;
+  stage->offset = 0;
+  if (stage->buffer) free(stage->buffer);
+  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
+  memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));
+ib_send:
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
+  if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
+
+  free(stage->buffer);
   *recvComm = rComm;
+
+  /* reset lComm stage */
+  stage->state = ncclIbCommStateStart;
+  stage->offset = 0;
+  stage->comm = NULL;
+  stage->buffer = NULL;
   return ncclSuccess;
 }
 
 ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
   for (int i=0; i<MAX_REQUESTS; i++) {
     struct ncclIbRequest* r = verbs->reqs+i;
-    if (r->used == 0) {
-      r->used = 1;
-      r->type = 0;
+    if (r->type == NCCL_NET_IB_REQ_UNUSED) {
       r->verbs = verbs;
       r->events = 1;
-      r->size = -1;
       r->addr = NULL;
       *req = r;
       return ncclSuccess;
@@ -598,7 +784,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
   return ncclInternalError;
 }
 ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
-  r->used = 0;
+  r->type = NCCL_NET_IB_REQ_UNUSED;
   return ncclSuccess;
 }
 
@@ -607,9 +793,9 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
 
   // Do not block on this receive, return if not ready.
   int bytes = 0;
-  NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes));
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
   if (bytes == 0) return ncclSuccess; // Try again later
-  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes));
+  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
 
   for (int q=0; q<comm->nqps; q++) {
     struct ibv_qp* qp = comm->qps[q];
@@ -618,7 +804,7 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
   }
   comm->ready = 1;
   // Block until this is done. It *should* not block indefinitely.
-  NCCLCHECK(socketSend(comm->fd, &comm->addr, &comm->ready, sizeof(int)));
+  NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int)));
 
   return ncclSuccess;
 }
@@ -626,39 +812,172 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
 ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
   // Do not block on this receive, return if not ready.
   int bytes = 0;
-  NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes));
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
   if (bytes == 0) return ncclSuccess; // Try again later
-  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes));
+  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
   return ncclSuccess;
 }
 
 ncclResult_t ncclIbTest(void* request, int* done, int* size);
 
-#define REG_ALIGN (4096)
-
 ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
   static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
-  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
-  uint64_t addr = (uint64_t)data;
   assert(size > 0);
 
-  // Deregister / register
-  uint64_t regAddr = addr & (~(REG_ALIGN-1));
-  uint64_t regSize = addr+size - regAddr;
-  regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
-  struct ibv_mr* mr;
-  NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
-  *mhandle = (void*)mr;
-  TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
-  return ncclSuccess;
+  static __thread uintptr_t pageSize = 0;
+  if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE);
+
+  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
+  struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+  uintptr_t addr = (uintptr_t)data & -pageSize;
+  int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+  ncclResult_t res;
+  pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+  for (int slot=0; /*true*/; slot++) {
+    if (slot == cache->population) { // didn't find in cache
+      if (cache->population == cache->capacity) { // must grow cache
+        cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
+        NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning);
+      }
+      // Not cached: register the page-aligned region as a new MR
+      struct ibv_mr* mr;
+      unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
+      if (ncclIbRelaxedOrderingEnabled) {
+        // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+        NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning);
+      }
+      else {
+        NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
+      }
+      TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey);
+      cache->population += 1;
+      cache->slots[slot].addr = addr;
+      cache->slots[slot].pages = pages;
+      cache->slots[slot].refs = 1;
+      cache->slots[slot].mr = mr;
+      *mhandle = (void*)mr;
+      res = ncclSuccess;
+      goto returning;
+    }
+    else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) {
+      cache->slots[slot].refs += 1;
+      *mhandle = (void*)cache->slots[slot].mr;
+      res = ncclSuccess;
+      goto returning;
+    }
+  }
+returning:
+  pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+  return res;
 }
 
 ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
-  NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
+  struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+  ncclResult_t res;
+  pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+  for (int i=0; i < cache->population; i++) {
+    if (mhandle == cache->slots[i].mr) {
+      if (0 == --cache->slots[i].refs) {
+        memmove(&cache->slots[i], &cache->slots[--cache->population], sizeof(struct ncclIbMr));
+        if (cache->population == 0) {
+          free(cache->slots);
+          cache->slots = NULL;
+          cache->capacity = 0;
+        }
+        NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, returning);
+      }
+      res = ncclSuccess;
+      goto returning;
+    }
+  }
+  WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population);
+  res = ncclInternalError;
+returning:
+  pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+  return res;
+}
+
+ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+  struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+  volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
+  int nreqs = slots[0].nreqs;
+  if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
+
+  uint64_t wr_id = 0ULL;
+
+  for (int r=0; r<nreqs; r++) {
+    struct ibv_send_wr* wr = comm->wrs+r;
+    memset(wr, 0, sizeof(struct ibv_send_wr));
+
+    struct ibv_sge* sge = comm->sges+r;
+    sge->addr=(uintptr_t)reqs[r]->send.data;
+    sge->lkey=reqs[r]->send.lkey;
+
+    wr->opcode = IBV_WR_RDMA_WRITE;
+    wr->send_flags = 0;
+    wr->wr.rdma.remote_addr = slots[r].addr;
+    wr->wr.rdma.rkey = slots[r].rkey;
+    wr->next = wr+1;
+    wr_id += (reqs[r] - comm->verbs.reqs) << (r*8);
+  }
+
+  // Write size as immediate data. In the case of multi-send, only write
+  // 0 or 1 as size to indicate whether there was data sent or received.
+  uint64_t immData = 0;
+  if (nreqs == 1) {
+    immData = reqs[0]->send.size;
+  } else {
+    uint8_t* multiImmData = (uint8_t*)&immData;
+    for (int r=0; r<nreqs; r++) {
+      multiImmData[r] = reqs[r]->send.size ? 1 : 0;
+    }
+  }
+
+  struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
+  if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) {
+    // When using adaptive routing, send the bulk of the data first as an
+    // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+    // completion.
+    lastWr++;
+    memset(lastWr, 0, sizeof(struct ibv_send_wr));
+  }
+  lastWr->wr_id = wr_id;
+  lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+  lastWr->imm_data = immData;
+  lastWr->next = NULL;
+  lastWr->send_flags = IBV_SEND_SIGNALED;
+
+  // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work
+  const int align = 128;
+  for (int q=0; q<comm->nqps; q++) {
+    for (int r=0; r<nreqs; r++) {
+      int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
+      int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
+      if (length <= 0) {
+        comm->wrs[r].sg_list = NULL;
+        comm->wrs[r].num_sge = 0;
+      } else {
+        comm->sges[r].length = length;
+        comm->wrs[r].sg_list = comm->sges+r;
+        comm->wrs[r].num_sge = 1;
+      }
+    }
+    struct ibv_send_wr* bad_wr;
+    NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr));
+
+    for (int r=0; r<nreqs; r++) {
+      int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
+      reqs[r]->send.offset += chunkSize;
+      comm->sges[r].addr += chunkSize;
+      comm->wrs[r].wr.rdma.remote_addr += chunkSize;
+    }
+  }
+
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
@@ -666,108 +985,89 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
   struct ibv_mr* mr = (struct ibv_mr*)mhandle;
 
   // Wait for the receiver to have posted the corresponding receive
-  volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
-  volatile uint32_t * readyPtr = &slot->ready;
-  if (*readyPtr == 0) { *request = NULL; return ncclSuccess; }
+  int nreqs = 0;
+  volatile struct ncclIbSendFifo* slots;
 
-  struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
-  req->size = size;
-  req->addr = &comm->addr;
+  int slot = (comm->fifoHead)%MAX_REQUESTS;
+  struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+  slots = comm->fifo[slot];
+  int idx = comm->fifoHead+1;
+  if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
+  nreqs = slots[0].nreqs;
+  // Wait until all nreqs FIFO entries of this slot have arrived
+  for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
+  __sync_synchronize(); // order the nreqs/idx loads against tag/rkey/addr loads below
+  for (int r=0; r<nreqs; r++) {
+    if (reqs[r] != NULL || slots[r].tag != tag) continue;
 
-  struct ibv_send_wr wr[2];
-  memset(&wr[0], 0, sizeof(wr[0]));
-  wr[0].wr_id = (uint64_t)req;
-
-  struct ibv_sge sge;
-  sge.addr=(uintptr_t)data; sge.lkey=mr->lkey;
-
-#if USE_RDMA_WRITE == 0
-  wr[0].opcode = IBV_WR_SEND;
-  wr[0].send_flags = IBV_SEND_SIGNALED;
-#else
-  __sync_synchronize(); // order the readyPtr load against rkey load below
-  // Sanity checks to catch user collective call count/size mismatches
-  // plus any potential programming errors
-  if (size > slot->size || slot->size < 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
-    char line[SOCKET_NAME_MAXLEN+1];
-    WARN("NET/IB : peer %s collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
-         socketToString(req->addr, line), size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
-    return ncclInternalError;
-  }
-  wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-  wr[0].send_flags = IBV_SEND_SIGNALED;
-  wr[0].wr.rdma.remote_addr = slot->addr;
-  wr[0].wr.rdma.rkey = slot->rkey;
-  wr[0].imm_data = size; // Send the message size via imm_data
-  __sync_synchronize();
-#endif
-  // We must clear slot->ready, but reset other fields to aid
-  // debugging and sanity checks
-  slot->ready = 0;
-  slot->addr = 0ULL;
-  slot->rkey = slot->size = slot->seq = 0;
-  comm->fifoHead++;
-
-
-#if USE_RDMA_WRITE
-  // When using adaptive routing, send the bulk of the data first as an
-  // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
-  // completion.
-  if (size > ncclParamIbArThreshold()) {
-    memset(&wr[1], 0, sizeof(wr[1]));
-    memcpy(&wr[1], &wr[0], sizeof(wr[0]));
-    wr[1].sg_list = NULL;
-    wr[1].num_sge = 0;
-    wr[0].next = &wr[1];
-
-    wr[0].opcode = IBV_WR_RDMA_WRITE;
-    wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-
-    wr[0].send_flags = 0;
-    wr[1].send_flags = IBV_SEND_SIGNALED;
-  }
-#endif
-
-  int chunkSize = std::max(8, DIVUP(size, comm->nqps));
-
-  int offset = 0;
-  for (int q=0; q<comm->nqps; q++) {
-    int length = std::min(size-offset, chunkSize);
-    if (length <= 0) {
-      wr[0].sg_list = NULL;
-      wr[0].num_sge = 0;
-    } else {
-      sge.length = length;
-      wr[0].sg_list = &sge;
-      wr[0].num_sge = 1;
+    // Sanity checks to catch user collective call count/size mismatches
+    if (size > slots[r].size) {
+      char line[SOCKET_NAME_MAXLEN+1];
+      WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d",
+           r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size);
+      return ncclInvalidUsage;
+    } // plus any potential programming errors
+    else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
+     char line[SOCKET_NAME_MAXLEN+1];
+     WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
+          r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), slots[r].size, slots[r].addr, slots[r].rkey);
+      return ncclInternalError;
     }
-    struct ibv_send_wr* bad_wr;
-    NCCLCHECK(wrap_ibv_post_send(comm->qps[q], wr, &bad_wr));
-    offset += chunkSize;
-    sge.addr += chunkSize;
-    wr[0].wr.rdma.remote_addr += chunkSize;
-  }
-  req->events = comm->nqps;
+    struct ncclIbRequest* req;
+    NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
+    req->type = NCCL_NET_IB_REQ_SEND;
+    req->addr = &comm->sock.addr;
+    req->verbs = &comm->verbs;
+    req->nreqs = nreqs;
+    req->send.size = size;
+    req->send.data = data;
+    req->send.lkey = mr->lkey;
+    req->send.offset = 0;
+    req->addr = &comm->sock.addr;
+    req->events = comm->nqps;
+    *request = reqs[r] = req;
 
-  *request = req;
+    // If this is a multi-recv, send only when all requests have matched.
+    for (int r=0; r<nreqs; r++) {
+      if (reqs[r] == NULL) return ncclSuccess;
+    }
+
+    TIME_START(0);
+    NCCLCHECK(ncclIbMultiSend(comm, slot));
+
+    // Clear slots[0].nreqs, as well as the other fields of slots[0], to help debugging and sanity checks
+    memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
+    memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
+    comm->fifoHead++;
+    TIME_STOP(0);
+    return ncclSuccess;
+  }
+
+  *request = NULL;
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) {
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
 
-  int slot = comm->remFifo.tail%MAX_REQUESTS;
-  struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot;
-  localElem->addr = addr;
-  localElem->rkey = rkey;
-  localElem->ready = 1;
-  localElem->size = size; // Sanity/Debugging
-  localElem->seq = comm->remFifo.tail; // Sanity/Debugging
-  wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo);
+  int slot = comm->remFifo.fifoTail%MAX_REQUESTS;
+  struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot];
+
+  for (int i=0; i<n; i++) {
+    localElem[i].addr = (uint64_t)data[i];
+    struct ibv_mr* mr = (struct ibv_mr*)mhandles[i];
+    localElem[i].rkey = mr->rkey;
+    localElem[i].nreqs = n;
+    localElem[i].size = sizes[i]; // Sanity/Debugging
+    localElem[i].tag = tags[i];
+    localElem[i].idx = comm->remFifo.fifoTail+1;
+  }
+
+  wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
   wr.wr.rdma.rkey = comm->remFifo.rkey;
   comm->remFifo.sge.addr = (uint64_t)localElem;
+  comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo);
   wr.sg_list = &comm->remFifo.sge;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_WRITE;
@@ -796,92 +1096,107 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
   //
   if (slot == 0) {
     wr.send_flags |= IBV_SEND_SIGNALED;
-    wr.wr_id = (uint64_t)req;
+    wr.wr_id = req - comm->verbs.reqs;
     req->events++;
   }
 
   struct ibv_send_wr* bad_wr;
   NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr));
-  comm->remFifo.tail++;
+  comm->remFifo.fifoTail++;
 
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
-
-  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+  if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
-  req->size = size;
-  req->addr = &comm->addr;
+  req->type = NCCL_NET_IB_REQ_RECV;
+  req->addr = &comm->sock.addr;
+  req->nreqs = n;
+  for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
 
   struct ibv_recv_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t)req;
+  wr.wr_id = req - comm->verbs.reqs;
 
   wr.sg_list = NULL;
   wr.num_sge = 0;
 
+  TIME_START(1);
   for (int q=0; q<comm->nqps; q++) {
     struct ibv_qp* qp = comm->qps[q];
     struct ibv_recv_wr* bad_wr;
     NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr));
   }
+  TIME_STOP(1);
   req->events = comm->nqps;
 
   *request = req;
 
   // Post to FIFO to notify sender
-  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req));
+  TIME_START(2);
+  NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req));
+  TIME_STOP(2);
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
-  if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
+  int last = -1;
+  for (int i=0; i<n; i++) if (sizes[i]) last = i;
+  if (comm->gpuFlush.enabled == 0 || last == -1) return ncclSuccess;
 
+  // Only flush once using the last non-zero receive
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
-  req->addr = &comm->addr;
-  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+  req->type = NCCL_NET_IB_REQ_FLUSH;
+  req->addr = &comm->sock.addr;
+  struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t)req;
+  wr.wr_id = req - comm->verbs.reqs;
 
-  wr.wr.rdma.remote_addr = (uint64_t)data;
+  wr.wr.rdma.remote_addr = (uint64_t)data[last];
   wr.wr.rdma.rkey = mr->rkey;
   wr.sg_list = &comm->gpuFlush.sge;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_READ;
   wr.send_flags = IBV_SEND_SIGNALED;
 
+  TIME_START(4);
   struct ibv_send_wr* bad_wr;
   NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
+  TIME_STOP(4);
 
   *request = req;
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbTest(void* request, int* done, int* size) {
+ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   struct ncclIbRequest *r = (struct ncclIbRequest*)request;
   *done = 0;
 
   while (1) {
     if (r->events == 0) {
       *done = 1;
-      if (size) *size = r->size;
+      if (sizes && r->type == NCCL_NET_IB_REQ_RECV) {
+        for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i];
+      }
       NCCLCHECK(ncclIbFreeRequest(r));
       return ncclSuccess;
     }
 
     int wrDone = 0;
     struct ibv_wc wcs[4];
+    TIME_START(3);
     NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
+    if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
     if (wrDone == 0) return ncclSuccess;
 
     for (int w=0; w<wrDone; w++) {
@@ -889,20 +1204,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
       if (wc->status != IBV_WC_SUCCESS) {
         char line[SOCKET_NAME_MAXLEN+1];
         WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
-             socketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+             ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
         return ncclSystemError;
       }
 
-      struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
-      if (doneReq) {
-        if (wc->opcode == IBV_WC_RECV) {
-          doneReq->size = wc->byte_len;
-#if USE_RDMA_WRITE
-        } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
-          doneReq->size = wc->imm_data;
-#endif
+      struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
+      if (req->type == NCCL_NET_IB_REQ_SEND) {
+        for (int i=0; i<req->nreqs; i++) {
+          struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff);
+          if ((sendReq->events <= 0)) return ncclInternalError;
+          sendReq->events--;
         }
-        doneReq->events--;
+      } else {
+        if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+          if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
+          if (req->nreqs > 1) {
+            // In the case of a multi recv, we only set sizes to 0 or 1.
+            uint8_t* sizes = (uint8_t*)&wc->imm_data;
+            for (int i=0; i<req->nreqs; i++) {
+              req->recv.sizes[i] |= sizes[i];
+            }
+          } else {
+            req->recv.sizes[0] += wc->imm_data;
+          }
+        }
+        req->events--;
       }
     }
   }
@@ -911,20 +1237,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
 ncclResult_t ncclIbCloseSend(void* sendComm) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm) {
-    close(comm->fd);
+    close(comm->sock.fd);
     for (int q=0; q<comm->nqps; q++)
       if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
     if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
     NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
+  TIME_PRINT("IB");
   return ncclSuccess;
 }
 
 ncclResult_t ncclIbCloseRecv(void* recvComm) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm) {
-    close(comm->fd);
+    close(comm->sock.fd);
     for (int q=0; q<comm->nqps; q++)
       if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
     if (comm->gpuFlush.enabled) {
@@ -941,7 +1268,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
 ncclResult_t ncclIbCloseListen(void* listenComm) {
   struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
   if (comm) {
-    close(comm->fd);
+    close(comm->sock.fd);
     free(comm);
   }
   return ncclSuccess;
@@ -965,3 +1292,4 @@ ncclNet_t ncclNetIb = {
   ncclIbCloseRecv,
   ncclIbCloseListen
 };
+
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index c045a8f91d..a8f69aa5f7 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -19,7 +19,7 @@
 /* Init functions */
 static int ncclNetIfs = -1;
 struct ncclSocketDev {
-  union socketAddress addr;
+  union ncclSocketAddress addr;
   char devName[MAX_IF_NAME_SIZE];
   char* pciPath;
 };
@@ -40,8 +40,8 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
     pthread_mutex_lock(&ncclSocketLock);
     if (ncclNetIfs == -1) {
       char names[MAX_IF_NAME_SIZE*MAX_IFS];
-      union socketAddress addrs[MAX_IFS];
-      ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      union ncclSocketAddress addrs[MAX_IFS];
+      ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
       if (ncclNetIfs <= 0) {
         WARN("NET/Socket : no interface found");
         return ncclInternalError;
@@ -53,10 +53,10 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
         addrline[SOCKET_NAME_MAXLEN] = '\0';
         for (int i=0; i<ncclNetIfs; i++) {
           strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
-          memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
+          memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
           NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
           snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
-              socketToString(&addrs[i], addrline));
+              ncclSocketToString(&addrs[i], addrline));
         }
         line[MAX_LINE_LEN] = '\0';
         INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
@@ -97,12 +97,14 @@ ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
   props->guid = dev;
   props->ptrSupport = NCCL_PTR_HOST;
   NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
+  props->latency = 0; // Not set
   props->port = 0;
   props->maxComms = 65536;
+  props->maxRecvs = 1;
   return ncclSuccess;
 }
 
-ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
+ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
   if (dev >= ncclNetIfs) return ncclInternalError;
   memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
   return ncclSuccess;
@@ -118,18 +120,33 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
 NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
 NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
 
+enum ncclSocketCommState {
+  ncclSocketCommStateStart = 0,    // nothing in flight; ncclSocketAccept resets the stage to this value when it completes
+  ncclSocketCommStateConnect = 1,  // ncclSocketConnect: connect issued, connection state still to be checked
+  ncclSocketCommStateAccept = 3,   // ncclSocketAccept: socket prepared, accept still to complete (value 2 is unused)
+  ncclSocketCommStateSend = 4,     // ncclSocketConnect: connected, socket index byte still to be sent
+  ncclSocketCommStateRecv = 5,     // ncclSocketAccept: accepted, socket index byte still to be received
+};
+
+struct ncclSocketCommStage {
+  enum ncclSocketCommState state;  // where ncclSocketConnect/ncclSocketAccept resume on their next call
+  uint8_t iteration;               // loop index of the socket currently being set up
+  struct ncclSocket* sock;         // socket currently being connected or accepted
+  struct ncclSocketComm* comm;     // comm under construction; returned to the caller once all sockets are ready
+};
+
 struct ncclSocketHandle {
-  union socketAddress connectAddr;
+  union ncclSocketAddress connectAddr;  // listener address, copied from the listen socket in ncclSocketListen
   int nSocks;
   int nThreads;
+  struct ncclSocketCommStage stage;     // resume state used by ncclSocketConnect; zeroed by the memset in ncclSocketListen
 };
 
 struct ncclSocketTask {
   int op;
   void* data;
   int size;
-  int fd;
-  union socketAddress *addr;
+  struct ncclSocket* sock;
   int offset;
   int used;
   ncclResult_t result;
@@ -139,8 +156,7 @@ struct ncclSocketRequest {
   int op;
   void* data;
   int size;
-  int ctrlFd;
-  union socketAddress *addr;
+  struct ncclSocket* ctrlSock;
   int offset;
   int used;
   struct ncclSocketComm* comm;
@@ -154,29 +170,30 @@ struct ncclSocketTaskQueue {
   struct ncclSocketTask* tasks;
 };
 
-enum threadState {start, stop};
-
 struct ncclSocketThreadResources {
   struct ncclSocketTaskQueue threadTaskQueue;
-  enum threadState state;
+  int stop;  // set to 1 by ncclSocketClose while holding threadLock; persistentSocketThread returns once it sees it
   struct ncclSocketComm* comm;
   pthread_mutex_t threadLock;
   pthread_cond_t  threadCond;
 };
 
 struct ncclSocketListenComm {
-  int fd;
+  struct ncclSocket sock;            // listening socket; sock.fd is initialised to -1 by ncclSocketNewListenComm
+  struct ncclSocketCommStage stage;  // resume state used by ncclSocketAccept
   int nSocks;
   int nThreads;
+  int dev;                           // device index passed to ncclSocketListen, copied into accepted comms
 };
 
 struct ncclSocketComm {
-  int ctrlFd;
-  union socketAddress addr;
-  int fds[MAX_SOCKETS];
+  struct ncclSocket ctrlSock;
+  struct ncclSocket socks[MAX_SOCKETS];
+  int dev;
+  int cudaDev;
   int nSocks;
   int nThreads;
-  int nextFd;
+  int nextSock;
   struct ncclSocketRequest requests[MAX_REQUESTS];
   pthread_t helperThread[MAX_THREADS];
   struct ncclSocketThreadResources threadResources[MAX_THREADS];
@@ -185,7 +202,6 @@ struct ncclSocketComm {
 void* persistentSocketThread(void *args_) {
   struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
   struct ncclSocketComm* comm = resource->comm;
-  volatile enum threadState* state = &resource->state;
   struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
   int nSocksPerThread = comm->nSocks / comm->nThreads;
   while (1) {
@@ -198,7 +214,7 @@ void* persistentSocketThread(void *args_) {
         for (int j=0; j<nSocksPerThread; j++) {
           struct ncclSocketTask* r = myQueue->tasks+i+j;
           if (r != NULL && r->used == 1 && r->offset < r->size) {
-            r->result = socketProgress(r->op, r->fd, r->addr, r->data, r->size, &r->offset);
+            r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
             if (r->result != ncclSuccess) {
               WARN("NET/Socket : socket progress error");
               return NULL;
@@ -211,12 +227,12 @@ void* persistentSocketThread(void *args_) {
     }
     if (idle) {
       pthread_mutex_lock(&resource->threadLock);
-      while (mark == myQueue->next && *state != stop) { // no new tasks, wait
+      while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait
         pthread_cond_wait(&resource->threadCond, &resource->threadLock);
       }
       pthread_mutex_unlock(&resource->threadLock);
     }
-    if (*state == stop) return NULL;
+    if (resource->stop) return NULL;
   }
 }
 
@@ -271,17 +287,17 @@ end:
 
 ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
   NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->fd = -1;
+  (*comm)->sock.fd = -1;  // -1 marks the listen socket as not opened yet
   return ncclSuccess;
 }
 
 ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
   NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->ctrlFd = -1;
+  (*comm)->ctrlSock.fd = -1;  // -1 marks a socket as not opened; ncclSocketClose only closes fds that are not -1
   for (int i=0; i < MAX_SOCKETS; i++) {
-    (*comm)->fds[i] = -1;
+    (*comm)->socks[i].fd = -1;  // same "not opened" marker for every data socket slot
   }
-  (*comm)->nextFd = 0;
+  (*comm)->nextSock = 0;  // index of the data socket the next task will be assigned to (see ncclSocketGetTask)
   return ncclSuccess;
 }
 
@@ -290,14 +306,18 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
     return ncclInternalError;
   }
   struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+  memset(handle, 0, sizeof(struct ncclSocketHandle));
+  static_assert(sizeof(struct ncclSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
   struct ncclSocketListenComm* comm;
   NCCLCHECK(ncclSocketNewListenComm(&comm));
-  NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr));
-  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+  NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
+  NCCLCHECK(ncclSocketListen(&comm->sock));
+  memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
   NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
   handle->nSocks = comm->nSocks;
   handle->nThreads = comm->nThreads;
+  comm->sock.asyncFlag = 1;
+  comm->dev = dev;
   *listenComm = comm;
   return ncclSuccess;
 }
@@ -306,38 +326,99 @@ ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
   if (dev < 0) { // data transfer socket is based on specified dev
     return ncclInternalError;
   }
-  struct ncclSocketComm* comm;
-  NCCLCHECK(ncclSocketNewComm(&comm));
+
+  enum ncclSocketState conState;
   struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  struct ncclSocketCommStage* stage = &handle->stage;
+  struct ncclSocketComm* comm = stage->comm;
+  uint8_t i = stage->iteration;
+  struct ncclSocket* sock = stage->sock;
+  *sendComm = NULL;
+
+  if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check;
+  if (stage->state == ncclSocketCommStateSend) goto socket_send;
+
+  NCCLCHECK(ncclSocketNewComm(&comm));
+  stage->comm = comm;
   comm->nSocks = handle->nSocks;
   comm->nThreads = handle->nThreads;
-  for (int i=0; i<comm->nSocks+1; i++) {
-    int tmpFd, offset=0;
-    NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr));
-    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &handle->connectAddr, &i, sizeof(int), &offset));
-    if (i == comm->nSocks) comm->ctrlFd = tmpFd;
-    else comm->fds[i] = tmpFd;
+  comm->dev = dev;
+  CUDACHECK(hipGetDevice(&comm->cudaDev));
+  for (; i<comm->nSocks+1; i++) {
+    sock = i == comm->nSocks ? &comm->ctrlSock : comm->socks+i;
+    NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1));
+
+    stage->sock = sock;
+    stage->state = ncclSocketCommStateConnect;
+    stage->iteration = i;
+    NCCLCHECK(ncclSocketConnect(sock));
+
+socket_connect_check:
+    NCCLCHECK(ncclGetSocketState(sock, &conState));
+    if (conState == ncclSocketConnecting) {
+      /* expect user to call again */
+      return ncclSuccess;
+    } else if (conState == ncclSocketError) {
+      return ncclSystemError;
+    }
+    stage->state = ncclSocketCommStateSend;
+
+socket_send:
+    int done = 0;
+    NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done));
+    if (done == 0) return ncclSuccess;
   }
   *sendComm = comm;
-  comm->addr = handle->connectAddr;
   return ncclSuccess;
 }
 
 ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
   struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
-  struct ncclSocketComm* rComm;
+  struct ncclSocketCommStage* stage = &lComm->stage;  // progress saved in the listen comm so a later call can resume
+  struct ncclSocketComm* rComm = stage->comm;
+  uint8_t i = stage->iteration;
+  struct ncclSocket* sock = stage->sock;
+
+  *recvComm = NULL;  // stays NULL (with ncclSuccess) until every socket is accepted; the caller is expected to call again
+  if (stage->state == ncclSocketCommStateAccept) goto socket_accept;
+  if (stage->state == ncclSocketCommStateRecv) goto socket_recv;
+
   NCCLCHECK(ncclSocketNewComm(&rComm));
+  stage->comm = rComm;
   rComm->nSocks = lComm->nSocks;
   rComm->nThreads = lComm->nThreads;
-  for (int i=0; i<rComm->nSocks+1; i++) {
-    int tmpFd, sendSockIdx, offset=0;
-    socklen_t socklen = sizeof(union socketAddress);
-    SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", tmpFd);
-    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &rComm->addr, &sendSockIdx, sizeof(int), &offset));
-    if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd;
-    else rComm->fds[sendSockIdx] = tmpFd;
+  rComm->dev = lComm->dev;
+  CUDACHECK(hipGetDevice(&rComm->cudaDev));
+  lComm->sock.asyncFlag = 1;
+  for (; i<rComm->nSocks+1; i++) {  // nSocks data sockets plus one control socket
+    uint8_t sendSockIdx;
+    NCCLCHECK(ncclCalloc(&sock, 1));  // check the allocation so a NULL sock is never passed to ncclSocketInit
+    NCCLCHECK(ncclSocketInit(sock, NULL, NULL, 1));
+    stage->sock = sock;
+    stage->state = ncclSocketCommStateAccept;
+    stage->iteration = i;
+socket_accept:
+    NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
+    if (sock->fd == -1) return ncclSuccess;  // presumably no connection pending yet; resumes at socket_accept on the next call
+
+    stage->state = ncclSocketCommStateRecv;
+socket_recv:
+    int done = 0;
+    NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done));
+    if (done == 0) return ncclSuccess;  // index byte not received yet; resumes at socket_recv on the next call
+
+    if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));  // index nSocks designates the control socket
+    else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));  // NOTE(review): sendSockIdx is received from the peer and is not range-checked here
+
+    free(sock);
   }
   *recvComm = rComm;
+
+  /* reset lComm state */
+  stage->state = ncclSocketCommStateStart;
+  stage->iteration = 0;
+  stage->sock = NULL;
+  stage->comm = NULL;
   return ncclSuccess;
 }
 
@@ -348,8 +429,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
       r->op = op;
       r->data = data;
       r->size = size;
-      r->ctrlFd = comm->ctrlFd;
-      r->addr = &comm->addr;
+      r->ctrlSock = &comm->ctrlSock;
       r->used = 1;
       r->comm = comm;
       r->nSubs = 0;
@@ -362,7 +442,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
 }
 
 ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
-  int tid = comm->nextFd % comm->nThreads;
+  int tid = comm->nextSock % comm->nThreads;
   struct ncclSocketThreadResources* res = comm->threadResources+tid;
   struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
   // create helper threads and prepare per-thread task queue
@@ -377,22 +457,21 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
     pthread_mutex_init(&res->threadLock, NULL);
     pthread_cond_init(&res->threadCond, NULL);
     pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
+    ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
   }
   struct ncclSocketTask* r = queue->tasks+queue->next;
   if (r->used == 0) {
     r->op = op;
     r->data = data;
     r->size = size;
-    r->fd = comm->fds[comm->nextFd];
-    r->addr = &comm->addr;
+    r->sock = comm->socks+comm->nextSock;
     r->offset = 0;
     r->result = ncclSuccess;
-    comm->nextFd = (comm->nextFd + 1) % comm->nSocks;
+    comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
     r->used = 1;
     *req = r;
     pthread_mutex_lock(&res->threadLock);
     queue->next = (queue->next+1)%queue->len;
-    res->state = start;
     pthread_cond_signal(&res->threadCond);
     pthread_mutex_unlock(&res->threadLock);
     return ncclSuccess;
@@ -411,18 +490,20 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
   if (r->used == 1) { /* try to send/recv size */
     int data = r->size;
     int offset = 0;
-    NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset));
+    NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset));
 
     if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
 
     // Not sure we could ever receive less than 4 bytes, but just in case ...
-    if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset));
+    if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset));
 
     // Check size is less or equal to the size provided by the user
     if (r->op == NCCL_SOCKET_RECV && data > r->size) {
       char line[SOCKET_NAME_MAXLEN+1];
-      WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size);
-      return ncclInternalError;
+      WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
+          there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
+          ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
+      return ncclInvalidUsage;
     }
     r->size = data;
     r->offset = 0;
@@ -459,7 +540,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
       }
     } else { // progress request using main thread
       if (r->offset < r->size) {
-        NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, r->data, r->size, &r->offset));
+        NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset));
       }
       if (r->offset == r->size) {
         if (size) *size = r->size;
@@ -476,19 +557,20 @@ ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void**
 }
 ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
 
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {  // tag and mhandle are not used by this function
   struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
   NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {  // tags and mhandles are not used by this function
   struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
-  NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request));
+  if (n != 1) return ncclInternalError;  // only a single receive per call; ncclSocketGetProperties sets maxRecvs to 1
+  NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request));
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {  // always returns ncclInternalError; no argument is read
   // We don't support CUDA pointers, so we don't need a flush operation
   return ncclInternalError;
 }
@@ -496,7 +578,7 @@ ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandl
 ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
   struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm;
   if (comm) {
-    if (comm->fd != -1) close(comm->fd);
+    if (comm->sock.fd != -1) close(comm->sock.fd);
     free(comm);
   }
   return ncclSuccess;
@@ -509,16 +591,16 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
       struct ncclSocketThreadResources* res = comm->threadResources+i;
       if (comm->helperThread[i]) {
         pthread_mutex_lock(&res->threadLock);
-        res->state = stop;
+        res->stop = 1;
         pthread_cond_signal(&res->threadCond);
         pthread_mutex_unlock(&res->threadLock);
         pthread_join(comm->helperThread[i], NULL);
       }
       free(res->threadTaskQueue.tasks);
     }
-    if (comm->ctrlFd != -1) close(comm->ctrlFd);
+    if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd);
     for (int i=0; i<comm->nSocks; i++) {
-      if (comm->fds[i] != -1) close(comm->fds[i]);
+      if (comm->socks[i].fd != -1) close(comm->socks[i].fd);
     }
     free(comm);
   }
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index d98e18c8bc..c6513c5c1d 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,32 +8,30 @@
 #include "comm.h"
 #include "graph.h"
 #include "utils.h"
-#include "bootstrap.h"
 
-struct p2pConnectInfo {
-  int rank;
-  int read;
+struct ncclP2pBuff {  // buffer descriptor filled in by p2pProxySetup: the device pointer and the IPC handle exported for it
   void* directPtr;
   hipIpcMemHandle_t devIpc;
 };
 
+struct p2pConnectInfo {
+  int rank;                    // rank the buffer is requested from: own rank, or the intermediate rank for indirect P2P
+  int read;                    // non-zero when P2P reads are used rather than writes
+  struct ncclP2pBuff p2pBuff;  // buffer obtained through the proxy setup call and mapped with p2pMap
+};
+static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
+
 struct p2pSendResources {
   struct ncclSendMem* devMem;
-  void* ipcPtr;
   uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
-  int remoteId;
-  int memRank;
-  void* remIpcPtr;
-  void* bootstrap;
+  void* sendMemIpc;  // IPC mapping of the send buffer, set by p2pMap in p2pSendSetup; NULL when p2pMap used the direct pointer
+  void* recvMemIpc;  // IPC mapping of the peer's recv buffer, set by p2pMap in p2pSendConnect; NULL when p2pMap used the direct pointer
 };
 
 struct p2pRecvResources {
   struct ncclRecvMem* devMem;
-  void* ipcPtr;
-  int remoteId;
-  int memRank;
-  void* remIpcPtr;
-  void* bootstrap;
+  void* sendMemIpc;  // IPC mapping of the peer's send buffer, set by p2pMap in p2pRecvConnect; NULL when p2pMap used the direct pointer
+  void* recvMemIpc;  // IPC mapping of the recv buffer, set by p2pMap in p2pRecvSetup; NULL when p2pMap used the direct pointer
 };
 
 #include <sys/types.h>
@@ -103,15 +101,22 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 #else
   // Check that legacy IPC support is available
   if (p2p != 0) {
+    // Cached result of the legacyIPC detection
+    static int legacyIPC = -1;
+    if (legacyIPC >= 0) {
+      *ret = legacyIPC;
+      return ncclSuccess;
+    }
+    // Check that legacy IPC support is available (WSL WAR)
     char *dummy;
-    cudaIpcMemHandle_t ipc;
+    hipIpcMemHandle_t ipc;
     NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN));
-    if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) {
-      INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported on dev %d(=%lx)",
-           cudaDev1, info1->busId);
+    if (hipIpcGetMemHandle(&ipc, dummy) != hipSuccess) {
+      INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported");
       *ret = 0;
     }
-    CUDACHECK(cudaFree(dummy));
+    CUDACHECK(hipFree(dummy));
+    legacyIPC = *ret;
     return ncclSuccess;
   }
 #endif
@@ -132,6 +137,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
     TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
   } while (0)
 
+
 // Setting this to non zero causes P2P to use Reads rather than Writes
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
 
@@ -146,7 +152,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
   return ncclSuccess;
 }
 
-static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) {
+static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
   if (myInfo->pidHash == peerInfo->pidHash) {
     if (peerInfo->cudaDev != myInfo->cudaDev) {
       // Enable P2P access
@@ -159,10 +165,10 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
         return ncclInternalError;
       }
     }
-    *devMem = p2pInfo->directPtr;
+    *devMem = p2pBuff->directPtr;
     *ipcPtr = NULL;
   } else {
-    CUDACHECK(hipIpcOpenMemHandle(devMem, p2pInfo->devIpc, hipIpcMemLazyEnablePeerAccess));
+    CUDACHECK(hipIpcOpenMemHandle(devMem, p2pBuff->devIpc, hipIpcMemLazyEnablePeerAccess));
     *ipcPtr = *devMem;
   }
   return ncclSuccess;
@@ -188,44 +194,40 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
   }
 
-  struct p2pConnectInfo info;
-  // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
-  info.read = (connIndex == 0) ? useRead : 0;
-  const char* useReadStr = info.read ? "/read" : "";
+  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+  info->read = useRead;
+  // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
+  if (graph && connIndex == 1) info->read = 0;
+  const char* useReadStr = info->read ? "/read" : "";
 
   int sendSize = sizeof(struct ncclSendMem);
   // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
-  if (info.read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
+  if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
   ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
 
-  resources->remoteId = -1;
-  resources->bootstrap = comm->bootstrap;
   if (intermediateRank == -1) {
-    NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize, true));
-    info.rank = myInfo->rank;
+    info->rank = myInfo->rank;
     if (myInfo->pidHash == peerInfo->pidHash) {
-      send->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+      send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
           channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
     } else {
-      send->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
-      CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
+      send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
       INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
           channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
     }
   } else {
-    NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
-    info.rank = intermediateRank;
-    INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
+    info->rank = intermediateRank;
+    INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
         channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
-	comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
+	comm->peerInfo[intermediateRank].busId, useReadStr);
   }
-  resources->memRank = info.rank;
 
-  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
+  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
 
-  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
   return ncclSuccess;
 }
 
@@ -238,36 +240,32 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   int useRead, intermediateRank;
   NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
 
-  struct p2pConnectInfo info;
-  // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
-  info.read = (connIndex == 0) ? useRead : 0;
+  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+  info->read = useRead;
+  // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
+  if (graph && connIndex == 1) info->read = 0;
 
-  int recvSize = offsetof(struct ncclRecvMem, buff);
+  int recvSize = sizeof(struct ncclRecvMem);
   // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info.read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
   ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
 
-  resources->remoteId = -1;
-  resources->bootstrap = comm->bootstrap;
   if (intermediateRank == -1) {
-    NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize, true));
-    info.rank = myInfo->rank;
+    info->rank = myInfo->rank;
     if (myInfo->pidHash == peerInfo->pidHash) {
-      recv->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+      recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
     } else {
-      recv->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
-      CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
+      recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
     }
   } else {
-    NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
-    info.rank = intermediateRank;
+    info->rank = intermediateRank;
   }
-  resources->memRank = info.rank;
 
-  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
+  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
 
-  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
   return ncclSuccess;
 }
 
@@ -277,16 +275,16 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
   struct ncclRecvMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
 
-  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr));
+  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
 
-  int offset = 0;
+  char* buff = (char*)(remDevMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     if (info->read && p == NCCL_PROTO_SIMPLE) {
       /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
-      send->conn.buffs[p] = resources->devMem->buff;
+      send->conn.buffs[p] = (char*)(resources->devMem+1);
     } else {
-      send->conn.buffs[p] = remDevMem->buff + offset;
-      offset += send->comm->buffSizes[p];
+      send->conn.buffs[p] = buff;
+      buff += send->comm->buffSizes[p];
     }
   }
   send->conn.tail = &remDevMem->tail;
@@ -303,16 +301,16 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
   struct ncclSendMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
 
-  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr));
+  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
 
-  int offset = 0;
+  char* buff = (char*)(resources->devMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     if (info->read && p == NCCL_PROTO_SIMPLE) {
       /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
-      recv->conn.buffs[p] = remDevMem->buff;
+      recv->conn.buffs[p] = (char*)(remDevMem+1);
     } else {
-      recv->conn.buffs[p] = resources->devMem->buff + offset;
-      offset += recv->comm->buffSizes[p];
+      recv->conn.buffs[p] = buff;
+      buff += recv->comm->buffSizes[p];
     }
   }
   recv->conn.tail = &resources->devMem->tail;
@@ -322,39 +320,49 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
   return ncclSuccess;
 }
 
-ncclResult_t p2pSendFree(void* resources) {
-  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
-  if (sendRes->ipcPtr)
-    CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
-  if (sendRes->remIpcPtr)
-    CUDACHECK(hipIpcCloseMemHandle(sendRes->remIpcPtr));
-  if (sendRes->remoteId != -1) {
-    NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap));
-    sendRes->devMem = NULL;
-  }
-  CUDACHECK(hipFree(sendRes->devMem));
-  free(sendRes);
+ncclResult_t p2pSendFree(struct ncclConnector* send) {  // closes any IPC mappings held by the send connector and frees its resources struct
+  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
+  if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));  // non-NULL only when p2pMap opened an IPC handle
+  if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));  // NOTE(review): if CUDACHECK returns early on failure, resources is not freed — confirm
+  free(resources);  // the device buffer itself is released by p2pProxyFree, not here
   return ncclSuccess;
 }
 
-ncclResult_t p2pRecvFree(void* resources) {
-  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
-  if (recvRes->ipcPtr)
-    CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
-  if (recvRes->remIpcPtr)
-    CUDACHECK(hipIpcCloseMemHandle(recvRes->remIpcPtr));
-  if (recvRes->remoteId != -1) {
-    NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap));
-    recvRes->devMem = NULL;
+ncclResult_t p2pRecvFree(struct ncclConnector* recv) {  // closes any IPC mappings held by the recv connector and frees its resources struct
+  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
+  if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));  // non-NULL only when p2pMap opened an IPC handle
+  if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));  // NOTE(review): if CUDACHECK returns early on failure, resources is not freed — confirm
+  free(resources);  // the device buffer itself is released by p2pProxyFree, not here
+  return ncclSuccess;
+}
+
+static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {  // allocates the device buffer and returns its pointer and IPC handle through respBuff
+  if (reqSize != sizeof(int)) return ncclInternalError;
+  int size = *((int*)reqBuff);  // requested buffer size in bytes (p2pSendSetup/p2pRecvSetup send sendSize/recvSize)
+  if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
+  struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
+  NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, true));
+  hipError_t res = hipIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
+  if (res != hipSuccess) {
+    WARN("hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
+    hipFree(p2pBuff->directPtr);
+    free(p2pBuff);  // NOTE(review): p2pBuff aliases the caller-provided respBuff — confirm the caller expects it to be freed here
+    CUDACHECK(res);
   }
-  CUDACHECK(hipFree(recvRes->devMem));
-  free(recvRes);
+  connection->transportResources = p2pBuff->directPtr;  // published only after success, so p2pProxyFree never frees an already released pointer
+  *done = 1;
+  return ncclSuccess;
+}
+
+static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {  // releases the device buffer that p2pProxySetup stored in transportResources
+  // Do not check return code as CUDA may have already shut down
+  hipFree(connection->transportResources);
   return ncclSuccess;
 }
 
 struct ncclTransport p2pTransport = {
   "P2P",
   p2pCanConnect,
-  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
-  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
+  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL },  // send-side callbacks; NULL entries are left unset
+  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }   // recv-side callbacks; same proxy setup/free functions as the send side
 };
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index af20188981..974a2ab621 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,12 +8,10 @@
 #include "shm.h"
 
 struct shmConnectInfo {
-  uint64_t pidHash;
-  int id;
-  int sendRank;
-  int recvRank;
+  char shmName[7];
   int shmSize;
 };
+static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");
 
 struct shmSendResources {
   int remShmSize;
@@ -63,22 +60,17 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
 
-  struct shmConnectInfo info;
-  info.id = channelId;
-  info.pidHash = myInfo->pidHash;
-  info.sendRank = myInfo->rank;
-  info.recvRank = peerInfo->rank;
+  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
 
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
-  info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  char shmPath[PATH_MAX];
+  shmPath[0] = '\0';
+  info->shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
+  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
 
-  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d", channelId, myInfo->rank, 
-		  myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks);
-  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
+  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
   return ncclSuccess;
 }
 
@@ -87,22 +79,18 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
 
-  struct shmConnectInfo info;
-  info.id = channelId;
-  info.pidHash = myInfo->pidHash;
-  info.sendRank = peerInfo->rank;
-  info.recvRank = myInfo->rank;
+  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
 
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
-  int shmSize = offsetof(struct ncclRecvMem, buff);
+  char shmPath[PATH_MAX];
+  shmPath[0] = '\0';
+  int shmSize = sizeof(struct ncclRecvMem);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
-  info.shmSize = resources->shmSize = shmSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  info->shmSize = resources->shmSize = shmSize;
+  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
+  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
 
-  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
   return ncclSuccess;
 }
 
@@ -112,18 +100,18 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn
   struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
   struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
 
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+  char shmPath[PATH_MAX];
+  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
   resources->remShmSize = info->shmSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
+  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
   // Remove the file to ensure proper clean-up
-  NCCLCHECK(shmUnlink(shmName));
+  NCCLCHECK(ncclShmUnlink(shmPath));
 
   send->transportResources = resources;
   int offset = 0;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    send->conn.buffs[p] = resources->devRemHostMem->buff + offset;
+    send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset;
     offset += send->comm->buffSizes[p];
   }
   send->conn.tail = &resources->devRemHostMem->tail;
@@ -137,35 +125,35 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
   struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
   struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
 
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+  char shmPath[PATH_MAX];
+  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
   resources->remShmSize = info->shmSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
-  NCCLCHECK(shmUnlink(shmName));
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
+  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+  NCCLCHECK(ncclShmUnlink(shmPath));
   recv->conn.head = &resources->devRemHostMem->head;
 
   int offset = 0;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    recv->conn.buffs[p] = resources->devHostMem->buff + offset;
+    recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset;
     offset += recv->comm->buffSizes[p];
   }
   recv->conn.tail = &resources->devHostMem->tail;
   return ncclSuccess;
 }
 
-ncclResult_t shmSendFree(void* transportResources) {
-  struct shmSendResources* resources = (struct shmSendResources*)transportResources;
-  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
-  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ncclResult_t shmSendFree(struct ncclConnector* send) {  // releases both SHM mappings held by a send connector
+  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;  // send side stores shmSendResources, not the recv variant
+  NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));  // local segment
+  NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));  // peer's segment
   free(resources);
   return ncclSuccess;
 }
 
-ncclResult_t shmRecvFree(void* transportResources) {
-  struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
-  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
-  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ncclResult_t shmRecvFree(struct ncclConnector* recv) {
+  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
+  NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+  NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
   free(resources);
   return ncclSuccess;
 }
@@ -173,6 +161,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
 struct ncclTransport shmTransport = {
   "SHM",
   shmCanConnect,
-  { shmSendSetup, shmSendConnect, shmSendFree, NULL },
-  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
+  { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
+  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
 };
diff --git a/tools/topo_expl/include/utils.h b/tools/topo_expl/include/utils.h
index d249dd3697..068d669595 100644
--- a/tools/topo_expl/include/utils.h
+++ b/tools/topo_expl/include/utils.h
@@ -8,12 +8,6 @@
 #ifndef UTILS_H_
 #define UTILS_H_
 
-struct allGather1Data_t {
-  struct ncclPeerInfo peerInfo;
-  struct ncclComm* comm;
-  int cudaCompCap;
-};
-
 // AllGather3 - begin
 struct ncclGraphInfo {
   int pattern;
@@ -26,6 +20,7 @@ struct ncclGraphInfo {
 };
 
 struct allGather3Data_t{
+  int netDev;
   int collNetSupport;
   int nc;
   struct ncclGraphInfo tree;
@@ -40,9 +35,9 @@ ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem**
 
 ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
 
-ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data);
+ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash);
 
-ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data,
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
   struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);
 
 ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
diff --git a/tools/topo_expl/model.cpp b/tools/topo_expl/model.cpp
index 3285bb68f4..969ffbc6c3 100644
--- a/tools/topo_expl/model.cpp
+++ b/tools/topo_expl/model.cpp
@@ -66,6 +66,10 @@ ncclNet_t ncclNetDummy = {
 
 ncclNet_t* ncclNet = &ncclNetDummy;
 
+int ncclNetVersion() {
+  return 4;
+}
+
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 int busIdToCudaDev(int64_t busId) {
   return node_model->busIdToCudaDev(busId);
@@ -142,6 +146,19 @@ struct ncclTransport shmTransport = {
   { shmRecvSetup, NULL, NULL, NULL }
 };
 
+NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
+
+struct setupReq {
+  int rank;
+  int localRank;
+  int remoteRank;
+  int shared;
+  int netDev;
+  int useGdr;
+  int channelId;
+  int connIndex;
+};
+
 /* Determine if two peers can communicate with NET */
 ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   *ret = node_model->netCanConnect(info1->rank, info2->rank);
@@ -149,38 +166,47 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 }
 
 ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
-  int netDev, useGdr = 0;
+  struct setupReq req;
 
-  netDev = -1;
-  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &netDev));
-  if (netDev < 0) {
-    // Send/Receive: Round-robin NICs based on the receiver's CUDA device
-    int nicRR = comm->peerInfo[peerInfo->rank].cudaDev;
-    NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev));
+  send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+  req.channelId = channelId;
+  req.connIndex = connIndex;
+  req.netDev = -1;
+
+  int proxyRank = myInfo->rank;
+  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &req.netDev));
+  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
+
+  if (proxyRank == myInfo->rank) {
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+        req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
+  } else {
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+        proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   }
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr));
-
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev,
-      useGdr ? "/GDRDMA" : "");
+  *((int*)connectInfo) = proxyRank;
   return ncclSuccess;
 }
 
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
 
 ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
-  int netDev, useGdr = 0;
+ struct setupReq req;
 
-  netDev = -1;
-  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev));
-  if (netDev < 0) {
-    // Send/Receive: Round-robin NICs based on the receiver's CUDA device
-    int nicRR = comm->cudaDev;
-    NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev));
-  }
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr));
+  recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+  req.channelId = channelId;
+  req.connIndex = connIndex;
+  req.netDev = -1;
 
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev,
-      useGdr ? "/GDRDMA" : "");
+  // Use myInfo->rank as the receiver uses its own NIC
+  int proxyRank = myInfo->rank;
+  if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &req.netDev));
+  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
+
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
+      req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   return ncclSuccess;
 }
 
@@ -198,9 +224,9 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
 }
 
 ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
-  int netDev, useGdr = 0;
+  int netDev, useGdr = 0, proxy;
 
-  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev));
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netDev, &proxy));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr));
 
   INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? "/GDRDMA" : "");
@@ -208,9 +234,9 @@ ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 }
 
 ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
-  int netDev, useGdr = 0;
+  int netDev, useGdr = 0, proxy;
 
-  NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev));
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netDev, &proxy));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr));
 
   INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? "/GDRDMA" : "");
diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp
index da70b9c021..e4fe37e251 100644
--- a/tools/topo_expl/topo_expl.cpp
+++ b/tools/topo_expl/topo_expl.cpp
@@ -195,12 +195,17 @@ int main(int argc,char* argv[])
 
   NCCLCHECK(ncclCalloc(&comm, nranks));
 
-  struct allGather1Data_t *allGather1Data;
-  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+  struct ncclPeerInfo *peerInfo;
+  NCCLCHECK(ncclCalloc(&peerInfo, nranks+1)); // Extra rank to represent CollNet root
 
   struct allGather3Data_t *allGather3Data;
   NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
 
+  struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph;
+  NCCLCHECK(ncclCalloc(&treeGraph, nranks));
+  NCCLCHECK(ncclCalloc(&ringGraph, nranks));
+  NCCLCHECK(ncclCalloc(&collNetGraph, nranks));
+
   for (int i = 0; i < nranks; i++) {
     comm[i].rank = i;
     comm[i].nRanks = nranks;
@@ -211,22 +216,18 @@ int main(int argc,char* argv[])
     NCCLCHECK(ncclCalloc(&comm[i].p2pRecvs, comm->nRanks));
     node_model = network.GetNode(i);
     assert(node_model!=0);
+    comm[i].busId = node_model->getGpuBusId(i);
     comm[i].topo = node_model->getSystem(i);
-    bootstrapAllGather(&comm[i], allGather1Data);
+    comm[i].peerInfo = peerInfo;
     // Mark channels as non initialized.
     for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
-    NCCLCHECK(ncclCalloc((uint32_t**)&comm[i].p2pNet, 1));
-    NCCLCHECK(ncclCalloc(&comm[i].rankToIntraNodeRank, comm->nRanks));
+    NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
   }
 
-  struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph;
-  NCCLCHECK(ncclCalloc(&treeGraph, nranks));
-  NCCLCHECK(ncclCalloc(&ringGraph, nranks));
-  NCCLCHECK(ncclCalloc(&collNetGraph, nranks));
   for (int i = 0; i < nranks; i++) {
     node_model = network.GetNode(i);
     assert(node_model!=0);
-    initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]);
+    initTransportsRank_1(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]);
   }
 
   for (int i = 0; i < nranks; i++) {
@@ -246,7 +247,7 @@ int main(int argc,char* argv[])
   free(ringGraph);
   free(collNetGraph);
   free(allGather3Data);
-  free(allGather1Data);
+  free(peerInfo);
 
   free(comm);
   printf("Done generating topology using %d: %s\n", model_id, desc->description);
diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp
index 05285892d0..42f93cad02 100644
--- a/tools/topo_expl/utils.cpp
+++ b/tools/topo_expl/utils.cpp
@@ -37,10 +37,8 @@ const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 
 extern NodeModel *node_model;
 
-NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
 NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
 NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
-RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
 NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
 
 thread_local int ncclDebugNoWarn = 0;
@@ -111,11 +109,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   vsprintf(buffer+len, fmt, args);
   va_end(args);
   printf("%s\n", buffer);
+#if 0
   if (level == NCCL_LOG_WARN) {
     fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n",
             node_model->nodeId, node_model->currRank, filefunc, line);
     abort();
   }
+#endif
 }
 
 ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system) {
@@ -128,20 +128,6 @@ ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem**
 }
 
 
-ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) {
-  // AllGather1 - begin
-  allGather1Data[comm->rank].comm = comm;
-  allGather1Data[comm->rank].cudaCompCap = 1;
-  allGather1Data[comm->rank].peerInfo.rank = comm->rank;
-  allGather1Data[comm->rank].peerInfo.cudaDev = node_model->rankToCudaDev(comm->rank);
-  allGather1Data[comm->rank].peerInfo.gdrSupport = 1;
-  allGather1Data[comm->rank].peerInfo.hostHash = node_model->hostHash;
-  allGather1Data[comm->rank].peerInfo.pidHash = node_model->pidHash;
-  allGather1Data[comm->rank].peerInfo.shmDev = 0x19;
-  allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(comm->rank);
-  return ncclSuccess;
-}
-
 void initCollNet() {
   if (ncclParamCollNetEnable() == 1 && ncclCollNet == 0)
     ncclCollNet = (ncclCollNet_t*)0x12345678;
@@ -182,6 +168,30 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
   return ncclSuccess;
 }
 
+ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
+  info->rank = comm->rank;
+  info->cudaDev = node_model->rankToCudaDev(comm->rank);
+  info->hostHash = node_model->hostHash;
+  info->pidHash = node_model->pidHash;
+
+  // Get the device MAJOR:MINOR of /dev/shm so we can use that
+  // information to decide whether we can use SHM for inter-process
+  // communication in a container environment
+  //struct stat statbuf;
+  //SYSCHECK(stat("/dev/shm", &statbuf), "stat");
+  info->shmDev = 0x19;
+
+  info->busId = node_model->getGpuBusId(comm->rank);
+
+  // detect if fine grained memory is available on this GPU
+  info->hasFineGrain = true;
+  info->gdrSupport = 1;
+
+  info->comm = comm;
+  info->cudaCompCap = 1;
+  return ncclSuccess;
+}
+
 static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
   TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
   NCCLCHECK(initChannel(comm, channelId));
@@ -230,8 +240,8 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
       return ncclSuccess;
     }
   }
-  WARN("No transport found !");
-  return ncclInternalError;
+  WARN("No transport found for rank %d[%lx] -> rank %d[%lx]", myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+  return ncclSystemError;
 }
 
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
@@ -250,12 +260,19 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel*
   return ncclSuccess;
 }
 
+void dumpData(struct ncclConnect* data, int ndata) {  // debug helper: hex-dump each of the ndata ncclConnect records to stdout
+  for (int n=0; n<ndata; n++) {
+    printf("[%d] ", n);
+    uint8_t* d = (uint8_t*)(data+n);  // bytes of record n (previously always pointed at record 0)
+    for (int i=0; i<sizeof(struct ncclConnect); i++) printf("%02x", d[i]);
+    printf("\n");
+  }
+}
+
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
-#if CUDART_VERSION >= 11030
   // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
-  hipStream_t transportSetupStream;
-  CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
-#endif
+  //hipStream_t transportSetupStream;
+  //CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
   int highestType = TRANSPORT_P2P;  // track highest transport type
 
   struct ncclConnect data[2*MAXCHANNELS];
@@ -302,11 +319,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
         struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
         //NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
         conn->connected = 1;
-#if CUDART_VERSION >= 11030
         //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
-#else
-        //CUDACHECK(hipMemcpy(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
-#endif
       }
     }
     for (int c=0; c<MAXCHANNELS; c++) {
@@ -314,19 +327,13 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
         struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
         //NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
         conn->connected = 1;
-#if CUDART_VERSION >= 11030
         //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
-#else
-        //CUDACHECK(hipMemcpy(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
-#endif
       }
     }
     comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
   }
-#if CUDART_VERSION >= 11030
-  CUDACHECK(hipStreamSynchronize(transportSetupStream));
-  CUDACHECK(hipStreamDestroy(transportSetupStream));
-#endif
+  //CUDACHECK(hipStreamSynchronize(transportSetupStream));
+  //CUDACHECK(hipStreamDestroy(transportSetupStream));
   if (highestTransportType != NULL) *highestTransportType = highestType;
   return ncclSuccess;
 }
@@ -422,9 +429,9 @@ cleanup:
 
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
   // AllGather collNet setup results
-  int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0};
-  allGatherFailures[comm->intraNodeRank] = collNetSetupFail;
-  //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int)));
+  int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0};
+  allGatherFailures[comm->localRank] = collNetSetupFail;
+  //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int)));
   for (int i=0; i<comm->localRanks; i++) {
     if (allGatherFailures[i] != 0) {
       collNetSetupFail = 1;
@@ -432,7 +439,7 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
     }
   }
   if (collNetSetupFail) {
-    if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
+    if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
     return ncclSystemError;
   }
   return ncclSuccess;
@@ -457,80 +464,33 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data,
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
   struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
+  // We use 2 AllGathers
+  // 1. { peerInfo, comm, compCap}
+  // 2. { nChannels, graphInfo, topoRanks }
+
   int rank = comm->rank;
   int nranks = comm->nRanks;
   //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
   //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  //NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+  // [RCCL] Collect the PID of the root
+  int rootPid;
+  //NCCLCHECK(bootstrapInit(commId, comm));
+  // [/RCCL]
 
   // AllGather1 - begin
-  //struct {
-  //  struct ncclPeerInfo peerInfo;
-  //  struct ncclComm* comm;
-  //  int cudaCompCap;
-  //} *allGather1Data;
+  //NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
+  //NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, comm->rank));
+  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)));
 
-  //NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
-  //allGather1Data[rank].comm = comm;
-  //allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
-  struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
-  //NCCLCHECK(fillInfo(comm, myInfo, commHash));
-  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
-
-  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
   for (int i = 0; i < nranks; i++) {
-    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
-    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
-      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
+    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
       return ncclInvalidUsage;
     }
   }
 
-  // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
-  int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
-  int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0;
-  int myCompCap = allGather1Data[rank].cudaCompCap;
-  int minCompCap = myCompCap, maxCompCap = myCompCap;
-  for (int i = 0; i < nranks; i++) {
-    if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
-      // Rank is on same node
-      if (intraNodeRanks == 0) intraNodeRank0 = i;
-      if (i == rank) intraNodeRank = intraNodeRanks;
-      comm->intraNodeGlobalRanks[intraNodeRanks] = i;
-      comm->rankToIntraNodeRank[i] = intraNodeRanks;
-      intraNodeRanks++;
-      if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
-        // Rank is in same process
-        if (intraProcRanks == 0) intraProcRank0 = i;
-        if (i == rank) intraProcRank = intraProcRanks;
-        intraProcRanks++;
-      }
-    }
-    minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
-    maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
-  }
-  TRACE(NCCL_INIT,"hostHash[%d] %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
-        rank, allGather1Data[rank].peerInfo.hostHash, intraNodeRank, intraNodeRanks, intraNodeRank0);
-  TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
-        rank, allGather1Data[rank].peerInfo.pidHash, intraProcRank, intraProcRanks, intraProcRank0);
-  if (intraProcRank == -1 || intraProcRank0 == -1 || allGather1Data[intraProcRank0].comm == NULL) {
-    WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
-         rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
-         intraProcRank, intraProcRanks, intraProcRank0);
-    return ncclInternalError;
-  }
-  if (intraNodeRank == -1 || intraNodeRank0 == -1 || intraNodeRanks == 0) {
-    WARN("Failed to determine intra node ranks rank %d hostHash %lx pidHash %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
-         rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
-         intraNodeRank, intraNodeRanks, intraNodeRank0);
-    return ncclInternalError;
-  }
-  struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm;
-  uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash;
-  comm->intraNodeRank = intraNodeRank;
-
   // AllGather1 - end
 
   // Topo detection / System graph creation
@@ -550,11 +510,23 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   // Print final topology
   NCCLCHECK(ncclTopoPrint(comm->topo));
 
+  // Set affinity to a CPU local to our GPU, so that all memory we allocate
+  // on the host is local.
+  //NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
+  //cpu_set_t affinitySave;
+ // if (CPU_COUNT(&comm->cpuAffinity)) {
+    //sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+    //sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  //}
+  ncclResult_t ret;
+
+  // Launch proxy service thread
+  //NCCLCHECK(ncclProxyCreate(comm));
+
   // Get rings and trees
   //struct ncclTopoGraph ringGraph;
   ringGraph.id = 0;
   ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
-  ringGraph.crossNic = ncclParamCrossNic();
   ringGraph.collNet = 0;
   ringGraph.minChannels = 1;
   ringGraph.maxChannels = MAXCHANNELS/2;
@@ -564,7 +536,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   //struct ncclTopoGraph treeGraph;
   treeGraph.id = 1;
   treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
-  treeGraph.crossNic = ncclParamCrossNic();
   treeGraph.collNet = 0;
   treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
   treeGraph.maxChannels = ringGraph.nChannels;
@@ -575,56 +546,55 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   collNetGraph.id = 2;
   collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
   collNetGraph.collNet = 1;
-  collNetGraph.crossNic = ncclParamCrossNic();
-  collNetGraph.minChannels = 1;
-  collNetGraph.maxChannels = ringGraph.nChannels;
+  collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
   NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
   NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
 
-  bool allXgmi = true;
+  bool allXgmi = true, hasPeerAccess = true;
+  // Check that all the GPUs have peer access to one another and are XGMI connected
+  for (int i = 0; i < nranks && hasPeerAccess; i++) {
+    int cudaDev1 = comm->peerInfo[i].cudaDev;
+    for (int j = 0; j < nranks; j++) {
+      if (i == j) continue;
+      int cudaDev2 = comm->peerInfo[j].cudaDev;
+      int p2p;
+      if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
+      {
+        hasPeerAccess = false;
+        break;
+      }
+
+      bool isXGMI;
+      // Limit to single intermediate GPU for enabling clique
+      NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1));
+      allXgmi &= isXGMI;
+    }
+  }
+
+#if 0
   { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
-    //CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
+    CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
     if (comm->localRanks == comm->nRanks && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)
     {
-      // Check that all the GPUs have peer access to one another and are XGMI connected
-      bool hasPeerAccess = true;
-      for (int i = 0; i < nranks && hasPeerAccess; i++)
+      if (hasPeerAccess)
       {
-        int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
-        for (int j = 0; j < nranks; j++)
-        {
-          if (i == j) continue;
-          int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
-          //int p2p;
-          //if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
-          //{
-          //  hasPeerAccess = false;
-          //  break;
-          //}
-
-          bool isXGMI;
-          NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1));
-          allXgmi &= isXGMI;
-        }
+        if (intraProcRanks == nranks)
+          cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
+        else
+          cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
       }
-      //if (hasPeerAccess)
-      //{
-      //  if (intraRanks == nranks)
-      //    cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
-      //  else
-      //    cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
-      //}
 
       // For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
-      //if (!allXgmi && !rcclParamCliqueIgnoreTopo())
-      //{
-      //  INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
-      //  cliqueMode = CliqueManager::CLIQUE_DISABLED;
-      //}
+      if (!allXgmi && !rcclParamCliqueIgnoreTopo())
+      {
+        INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
+        cliqueMode = CliqueManager::CLIQUE_DISABLED;
+      }
     }
-    //comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
-    //NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
+    comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
+    NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
   } // [/RCCL]
+#endif
 
   if (comm->rank == ncclParamGraphDumpFileRank()) {
     struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
@@ -633,19 +603,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
 
   // Determine local CollNet support before all-gather
   if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
-  if (intraNodeRanks > 8) {
-    if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
-    comm->collNetSupport = 0;
-  }
 
-  if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
-    if (rcclParamP2pNetDisable() == 0) {
-      comm->p2pNet = 1;
-      INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
-    }
-    else
-      INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
-  }
   // AllGather3 - begin
 #if 0
   struct ncclGraphInfo {
@@ -659,6 +617,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   };
 
   struct {
+    int netDev;
     int collNetSupport;
     int nc;
     struct ncclGraphInfo tree;
@@ -670,7 +629,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
 #endif
   int idx;
-  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
+  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
   allGather3Data[rank].nc = 2;
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
     allGather3Data[rank].nc = 4;
@@ -684,6 +643,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
     allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
   if (ringGraph.nChannels > MAXCHANNELS/2)
     allGather3Data[rank].nc = 1;
+  NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev));
   allGather3Data[rank].tree.pattern = treeGraph.pattern;
   allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
   allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -717,25 +677,57 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
   int rank = comm->rank;
   int nranks = comm->nRanks;
+  ncclResult_t ret;
   //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
 
   // Determine nNodes, firstRanks, ...
   int *nodesFirstRank, *nodesTreePatterns;
   NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
   NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
-  for (int i=0; i<nranks; i++) {
-    int node = -1;
-    int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
-    for (int n=0; n<comm->nNodes; n++) {
-      if (nodesFirstRank[n] == firstRank) node = n;
-    }
-    if (node == -1) {
-      node = comm->nNodes++;
+  NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks));
+  for (int r=0; r<nranks; r++) {
+    int node;
+    int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+    for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+    if (node == comm->nNodes) {
+      comm->nNodes++;
       nodesFirstRank[node] = firstRank;
       // Record tree pattern of each node as they can be different depending on sm arch
-      nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
+      nodesTreePatterns[node] = allGather3Data[r].tree.pattern;
     }
-    if (i == comm->rank) comm->node = node;
+    comm->rankToNode[r] = node;
+  }
+  // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+  NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes));
+  NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks));
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+    comm->nodeRanks[node].localRanks++;
+  }
+  // Allocate ranks arrays for each node
+  for (int n=0; n<comm->nNodes; n++) {
+    NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks));
+    comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+    comm->nodeRanks[n].localRanks = 0;
+  }
+  // And fill the ranks arrays
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+  }
+  comm->node = comm->rankToNode[rank];
+  comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+  comm->localRank = comm->rankToLocalRank[rank];
+  comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+  TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+        rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+  if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+    WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+         rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+         comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+    return ncclInternalError;
   }
 
   int nChannelsOrig = comm->nChannels;
@@ -743,6 +735,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
   int nc = allGather3Data[0].nc;
   for (int i=0; i<nranks; i++) {
+    comm->peerInfo[i].netDev = allGather3Data[i].netDev;
     allTopoRanks[i] = &allGather3Data[i].topoRanks;
     nc = std::min(allGather3Data[i].nc, nc);
     // Make sure we align all ranks so that the tuning is consistent across ranks
@@ -750,20 +743,20 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
     treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
     treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
     treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
-    treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
-    treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
+    treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
+    treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
     ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
     ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
     ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
     ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
-    ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
-    ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
+    ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+    ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
     collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
     collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
     collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
     collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
-    collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
-    collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
+    collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
+    collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
     comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
   }
 
@@ -776,12 +769,20 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
     for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
   }
 
-  // Determine CollNet support after all-gather now that we know nNodes
-  int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
-  if (comm->nNodes < collNetNodeThreshold) {
-    if (comm->collNetSupport == 1)
+  // Determine CollNet support after all-gather now that we know nNodes and each node's localRanks
+  if (comm->collNetSupport == 1) {
+    int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+    if (comm->nNodes < collNetNodeThreshold) {
       INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
-    comm->collNetSupport = 0;
+      comm->collNetSupport = 0;
+    }
+    for (int n=0; n<comm->nNodes; n++) {
+      if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
+        WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
+        comm->collNetSupport = 0;
+        break;
+      }
+    }
   }
 
   int *rings;
@@ -808,16 +809,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   line[1023] = '\0';
   INFO(NCCL_INIT, "Trees%s", line);
 
-  // Set Affinity to a CPU local the our GPU, so that all memory we allocate
-  // on the host is local.
-  //NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
-  //cpu_set_t affinitySave;
-  //if (CPU_COUNT(&comm->cpuAffinity)) {
-  //  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  //  sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
-  //}
-  ncclResult_t ret;
-
   //NCCLCHECK(computeBuffSizes(comm));
 
   // Connect with prev/next for each ring
@@ -828,7 +819,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
     NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
   }
   NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
-  if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
+  if (ringGraph.nIntraChannels) {
     comm->useIntraNet = 1;
     // Connect NET for intranode use
     for (int c=0; c<comm->nChannels; c++) {
@@ -854,7 +845,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   // Check if we can setup CollNet
   if (comm->collNetSupport > 0) {
     int collNetSetupFail = 0;
-    int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P};
+    int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P};
     // Find all head ranks
     int nHeads = collNetGraph.nChannels;
     int *heads;
@@ -894,13 +885,13 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
 
     // Exchange highest intra-node transport type among ranks
     // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
-    comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
-    //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int)));
-    //for (int i=0; i<comm->localRanks; i++) {
-      //if (highestTypes[i] > comm->intraHighestTransportType)
-        //comm->intraHighestTransportType = highestTypes[i];
-    //}
-    INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
+    comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
+    //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)));
+    for (int i=0; i<comm->localRanks; i++) {
+      if (highestTypes[i] > comm->intraHighestTransportType)
+        comm->intraHighestTransportType = highestTypes[i];
+    }
+    INFO(NCCL_INIT, "rank %d Connected CollNet comm %p nRanks %02d", rank, comm, comm->nRanks);
 
 collnet_cleanup:
     free(heads);
@@ -913,19 +904,96 @@ collnet_cleanup:
   TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
 
   // Compute time models for algorithm and protocol combinations
-  NCCLCHECK(ncclTopoTuneModel(comm, 1, 1, &treeGraph, &ringGraph, &collNetGraph));
+  do {
+    int myCompCap = comm->peerInfo[rank].cudaCompCap;
+    int minCompCap = myCompCap, maxCompCap = myCompCap;
+    for (int i = 0; i < nranks; i++) {
+      minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
+      maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
+    }
+    NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+  } while(0);
 
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
+#if 0
+  if (ncclParamNvbPreconnect()) {
+    // Connect p2p when using NVB path
+    int nvbNpeers;
+    int* nvbPeers;
+    NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
+    for (int r=0; r<nvbNpeers; r++) {
+      int peer = nvbPeers[r];
+      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
+          comm->connectRecv[peer] |= (1<<channelId);
+        }
+      }
+      delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
+      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
+          comm->connectSend[peer] |= (1<<channelId);
+        }
+      }
+    }
+    NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
+    free(nvbPeers);
+  }
+#endif
+  // Connect to local net proxy
+  struct ncclProxyConnector proxyConn;
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, comm->rank, &proxyConn.localRank));
+  //NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
+  //NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
 
-  //NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
+  // Then to remote ones when using PXN
+  if (ncclPxnDisable() == 0) {
+    int nranks;
+    int* pxnPeers;
+    NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
+    for (int r=0; r<nranks; r++) {
+      //NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
+      //NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+    }
+    free(pxnPeers);
+  }
 
-  //if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
+  do {
+    // Compute intra-process ranks
+    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+    for (int i = 0; i < nranks; i++) {
+      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+        // Rank is in same process
+        if (intraProcRanks == 0) intraProcRank0 = i;
+        if (i == rank) intraProcRank = intraProcRanks;
+        intraProcRanks++;
+      }
+    }
+    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+        rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+    if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+      WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+          rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+          intraProcRank, intraProcRanks, intraProcRank0);
+      return ncclInternalError;
+    }
+    //NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
+  } while(0);
+
+  /* Local intra-node barrier */
+  //NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
+
+  // Unlink proxy shm to make sure it will be properly cleaned up.
+  //NCCLCHECK(ncclProxyShmUnlink(comm));
 
   // We should have allocated all buffers, collective fifos, ... we can
   // restore the affinity.
 affinity_restore:
-  //sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  //if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
   if (ret != ncclSuccess) return ret;
 
   TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);