diff --git a/CMakeLists.txt b/CMakeLists.txt
index c96639f336..e9111cac6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -317,6 +317,8 @@ set(SRC_FILES
   src/include/git_version.h
   src/include/graph.h
   src/include/group.h
+  src/include/ibvcore.h
+  src/include/ibvsymbols.h
   src/include/ibvwrap.h
   src/include/info.h
   src/include/ipcsocket.h
@@ -379,6 +381,7 @@ set(SRC_FILES
   src/misc/argcheck.cc
 # src/misc/cudawrap.cc
 # src/misc/gdrwrap.cc
+  src/misc/ibvsymbols.cc
   src/misc/ibvwrap.cc
   src/misc/ipcsocket.cc
   src/misc/msccl/msccl_lifecycle.cc
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 35d1826e3f..60a019c0b2 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -12,6 +12,7 @@ DEBUG ?= 0
 TRACE ?= 0
 PROFAPI ?= 1
 NVTX ?= 1
+RDMA_CORE ?= 0
 
 NVCC = $(CUDA_HOME)/bin/nvcc
 
@@ -106,3 +107,7 @@ endif
 ifneq ($(PROFAPI), 0)
 CXXFLAGS += -DPROFAPI
 endif
+
+ifneq ($(RDMA_CORE), 0)
+CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1
+endif
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 6877b63a09..ba162237d4 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 17
-NCCL_PATCH   := 1
+NCCL_MINOR   := 18
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index ca5ddce466..dd5754989e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
 LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
-		misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
+		misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
 		misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
 		misc/ipcsocket.cc \
 		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index e542e26c87..fdbb7d04c8 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -222,7 +222,6 @@ struct bootstrapState {
   int cudaDev;
   int rank;
   int nranks;
-  int virtualId;
   uint64_t magic;
   volatile uint32_t *abortFlag;
 };
@@ -230,7 +229,6 @@ struct bootstrapState {
 ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) {
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  int virtualId = comm->virtualId;
   struct bootstrapState* state;
   struct ncclSocket* proxySocket;
   ncclSocketAddress nextAddr;
@@ -241,11 +239,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
   state->rank = rank;
   state->nranks = nranks;
   state->abortFlag = comm->abortFlag;
-  state->virtualId = virtualId;
   comm->bootstrap = state;
   comm->magic = state->magic = handle->magic;
 
-  TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
 
   // [RCCL] Register custom signal handlers if requested
   RegisterSignalHandlers();
@@ -308,11 +305,79 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
   NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
   NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
 
-  TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
 
   return ncclSuccess;
 }
 
+ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
+  ncclResult_t ret = ncclSuccess;
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  int prev, next;
+  ncclSocketAddress listenAddr, tmpAddr;
+  struct ncclSocket* proxySocket;
+  struct bootstrapState* state;
+
+  NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail);
+  state->rank = rank;
+  state->nranks = nranks;
+  state->abortFlag = comm->abortFlag;
+  comm->bootstrap = state;
+  comm->magic = state->magic = handle->magic;
+
+  prev = parentRanks[(rank-1+nranks)%nranks];
+  next = parentRanks[(rank+1)%nranks];
+
+  // Setup my sockets for the allgather ring and other p2p connections
+  NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
+  NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
+
+  // Create socket for other ranks to contact me
+  NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail);
+
+  // Exchange listen addresses over the parent's bootstrap: send mine to prev, receive next's
+  NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail);
+  NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail);
+  NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail);
+
+  NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
+  NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail);
+  // Accept the connect request from the previous rank in the AllGather ring
+  NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail);
+
+  // AllGather the listen addresses of all ranks
+  NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail);
+  memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress));
+  NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail);
+
+  if (parent->config.splitShare) {
+    /* map each rank of this comm to its rank in the top parent, via the parent's own mapping. */
+    for (int i = 0; i < nranks; ++i) {
+      comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
+    }
+    comm->proxyState = parent->sharedRes->proxyState;
+    ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
+  } else {
+    // Create the service proxy
+    NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
+    NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail);
+    NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail);
+    NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail);
+    NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
+    memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress));
+    NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
+    NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
+  }
+
+  INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
   struct bootstrapState* state = (struct bootstrapState*)commState;
   char* data = (char*)allData;
@@ -344,7 +409,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
   struct bootstrapState* state = (struct bootstrapState*)commState;
   struct ncclSocket sock;
 
-  NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
+  NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
   NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
   NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
   NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
@@ -405,7 +470,7 @@ ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank,
     }
   }
   else {
-    NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size));
+    NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/ranks[rank], bcastData, size));
   }
 
   TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
diff --git a/src/channel.cc b/src/channel.cc
index ed4c623d30..5a06029d9f 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -17,32 +17,122 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
   channel->id = channelId;
   channel->workFifoSent = 0;
 
-  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
 
-  // The extra on nRanks+1 is for collnet root (i.e. network)
-  channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers);
-  NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream));
-  ncclCommPushCudaFree(comm, channel->devPeers);
+  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
 
-  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
-  NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.cudaStream));
-  ncclCommPushCudaFree(comm, channel->devRingUserRanks);
-
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
-  CUDACHECK(hipEventRecord(comm->deviceStream.scratchEvent, comm->deviceStream.cudaStream));
-  CUDACHECK(hipStreamWaitEvent(comm->deviceStream.cudaStream, comm->deviceStream.scratchEvent, 0));
-
-  for (int r=0; r < nPeers; ++r) {
-    for (int b=0; b < NCCL_MAX_CONNS; b++) {
-      channel->peers[r].send[b].comm = comm;
-      channel->peers[r].recv[b].comm = comm;
+  if (channel->peers == NULL) {
+    // The extra on nRanks+1 is for collnet root (i.e. network)
+    // Allocate everything related to sharedRes with ncclCalloc as this can be
+    // shared between communicators hence should not be tied to comm.
+    if (sharedRes->peers[channelId] == NULL) {
+      NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks));
+    }
+    channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers);
+    for (int r = 0; r < nRanks; r++) {
+      channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r];
+      ncclAtomicRefCountIncrement(&channel->peers[r]->refCount);
     }
   }
 
+  if (channel->devPeers == NULL) {
+    if (sharedRes->devPeers[channelId] == NULL) {
+      NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream));
+    }
+    /* channel->devPeers is not shared, so just free it when calling commFree() */
+    NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
+    ncclCommPushCudaFree(comm, channel->devPeers);
+    for (int r = 0; r < nRanks; r++) {
+      uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+    }
+  }
+
+  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
+  NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
+  ncclCommPushCudaFree(comm, channel->devRingUserRanks);
+
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
+  CUDACHECK(hipEventRecord(sharedRes->deviceStream.scratchEvent, sharedRes->deviceStream.cudaStream));
+  CUDACHECK(hipStreamWaitEvent(sharedRes->deviceStream.cudaStream, sharedRes->deviceStream.scratchEvent, 0));
+
   return ncclSuccess;
 }
 
-ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
+  struct ncclChannel* channel = &comm->channels[channelId];
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
+
+  if (channel->nvlsPeers != NULL)
+    return ncclSuccess;
+
+  if (channel->id == -1)
+    NCCLCHECK(initChannel(comm, channelId));
+
+  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
+
+  if (share) {
+    channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
+    channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
+    for (int r = 0; r < comm->localRanks; ++r) {
+      int tr = comm->topParentLocalRanks[r];
+      uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
+      channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+      ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
+    }
+  } else {
+    NCCLCHECK(ncclCalloc(&channel->nvlsPeers, comm->localRanks));
+    NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, comm->localRanks, sharedRes->deviceStream.cudaStream));
+    for (int r = 0; r < comm->localRanks; ++r) {
+      uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
+      channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+      ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount);
+    }
+  }
+
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
+
+  return ncclSuccess;
+}
+
+ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
+  struct ncclChannel* channel = &comm->channels[channelId];
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
+  uintptr_t addr;
+
+  if (channel->collnetPeers != NULL)
+    return ncclSuccess;
+
+  if (channel->id == -1)
+    NCCLCHECK(initChannel(comm, channelId));
+
+  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
+
+  if (share) {
+    channel->collnetPeers = parent->channels[channelId].collnetPeers;
+    channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers;
+    addr = (uintptr_t)parent->channels[channelId].collnetDevPeers;
+    channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers;
+    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+    ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount);
+  } else {
+    NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1));
+    NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream));
+    addr = (uintptr_t)channel->collnetDevPeers;
+    channel->peers[comm->nRanks] = channel->collnetPeers;
+    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+    ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount);
+  }
+
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
+
+  return ncclSuccess;
+}
+
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) {
+  int nPeers = nRanks + collnetNRanks + nvlsNRanks;
   /* channel peers are only valid when async init thread completes commAlloc() and
    * the channel is intialized with initChannel(); if either is not done, this channel
    * should never be free. */
@@ -50,18 +140,23 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
 
   // Free transport proxy resources
   // Note: free all send resources first due to CollNet arrangement
-  for (int r=0; r<nRanks+1; r++) {
-    struct ncclChannelPeer* peer = channel->peers+r;
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
+  for (int r = 0; r < nPeers; r++) {
+    struct ncclChannelPeer* peer = channel->peers[r];
+    if (peer) {
+      if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
+        for (int b=0; b<NCCL_MAX_CONNS; b++) {
+          if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
+          if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
+        }
+        if (r == nRanks) {
+          free(channel->collnetPeers);
+          ncclCudaFree(channel->collnetDevPeers);
+        } else if (r == nPeers - 1) {
+          free(channel->nvlsPeers);
+          ncclCudaFree(channel->nvlsDevPeers);
+        }
+      }
     }
   }
-  for (int r=0; r<nRanks+1; r++) {
-    struct ncclChannelPeer* peer = channel->peers+r;
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
-    }
-  }
-
   return ncclSuccess;
 }
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index dbfca9b082..e4ad1964b4 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -51,7 +51,7 @@ namespace {
     T *inputBuf = (T*)args->sendbuff;
     T *outputBuf = (T*)args->recvbuff;
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16);
+      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex);
 
 #if defined(ENABLE_NPKIT)
     if (tid == 0) {
@@ -85,7 +85,7 @@ namespace {
       if (inputBuf + chunkOffset == outputBuf + offset) { // In place
         prims.directSend(chunkOffset, offset, nelem);
       } else {
-        prims.directCopySend(chunkOffset, offset, offset, nelem);
+        prims.directCopySend(chunkOffset, offset, nelem);
       }
 
       // k-2 steps: copy to next GPU
@@ -93,7 +93,7 @@ namespace {
         rankDest = ringRanks[nranks-j];
         offset = chunkOffset + rankDest * size;
 
-        prims.directRecvCopySend(offset, offset, nelem);
+        prims.directRecvCopySend(offset, nelem);
       }
 
       // Make final copy from buffer to dest.
@@ -148,19 +148,19 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
 
     if (tid < tidEndGather) {
       // Gather
-      int group = (0*Proto::MaxGroupWidth) | (0<<16);
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
+        prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+           args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*chunkSize;
         int nelem = min(chunkSize, size-offset);
         prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
       }
     } else if (tid < tidEndBcast) {
-      int group = (3*Proto::MaxGroupWidth) | (1<<16);
-      // Bcast through MC
+      // Bcast through NVLS
       Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args);
+        prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
+           args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*chunkSize;
         int nelem = min(chunkSize, size-offset);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 57444ab024..658fc30b57 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -66,7 +66,7 @@ namespace {
     }
 
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
+      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
 
 #if defined(ENABLE_NPKIT)
     if (tid == 0) {
@@ -158,7 +158,7 @@ namespace {
       }
 #endif
 
-      prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true);
+      prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
       if (tid == 0) {
@@ -180,7 +180,7 @@ namespace {
         chunk = modRanks(ringIx + nranks-j);
         offset = calcOffset(chunk);
         nelem = min(realChunkSize, size-offset);
-        prims.directRecvCopySend(offset, offset, nelem);
+        prims.directRecvCopySend(offset, nelem);
       }
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
@@ -342,7 +342,7 @@ namespace {
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid*int(chunkSize);
           int nelem = min(chunkSize, size-offset);
-          prims.directSendFromOutput(offset, offset, nelem);
+          prims.directSendFromOutput(offset, nelem);
         }
       }
       else if (tree->down[0] == -1) {
@@ -356,7 +356,7 @@ namespace {
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid*int(chunkSize);
           int nelem = min(chunkSize, size-offset);
-          prims.directRecvCopySend(offset, offset, nelem);
+          prims.directRecvCopySend(offset, nelem);
         }
       }
 
@@ -446,7 +446,7 @@ namespace {
       chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize);
 
     if (tree->up == -1) {
-      // Reduce and broadcast. Max number of recv is 3, max number of send is 3
+      // Reduce and broadcast. Max number of recv is 2, max number of send is 2
       Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
         prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
 
@@ -467,7 +467,7 @@ namespace {
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*int(chunkSize);
         int nelem = min(chunkSize, size-offset);
-        prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*doPost=*/true);
+        prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
       }
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
@@ -530,7 +530,8 @@ namespace {
     else {
       // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
       Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth);
+        prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
+            args->redOpArg, 1*Proto::MaxGroupWidth);
 
 #if defined(ENABLE_NPKIT)
       if (isNpKitThread) {
@@ -557,7 +558,7 @@ namespace {
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid*int(chunkSize);
           int nelem = min(chunkSize, size-offset);
-          prims.directRecvCopySend(offset, offset, nelem);
+          prims.directRecvCopySend(offset, nelem);
         }
       }
 
@@ -621,9 +622,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
 
     if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
       // Scatter
-      int group = (2*Proto::MaxGroupWidth) | (1<<16);
       Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+        prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff,
+           args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
         int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -634,16 +635,16 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
         }
       }
     } else if (tid >= tidStartReduce && direct->out != -1) {
-      int group = (3*Proto::MaxGroupWidth) | (1<<16);
       if (hasDn) {
         // Reduce, send to network
         Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+          prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff,
+             args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
           if (args->regUsed) {
-            prims.directRecvReduceSend(offset, offset, nelem);
+            prims.directRecvReduceSend(offset, nelem);
           } else {
             prims.recvReduceSend(offset, nelem);
           }
@@ -651,7 +652,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
       } else {
         // Directly send to network
         Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
+          prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
+             args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
@@ -660,29 +662,30 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
       }
     } else if (tid < tidStartBcast && hasUp) {
       // Gather
-      int group = (0*Proto::MaxGroupWidth) | (0<<16);
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+        prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff,
+           args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
         int nelem = min(direct->nHeads*chunkSize, size-offset);
         prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
       }
     } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
-      int group = (1*Proto::MaxGroupWidth) | (0<<16);
       if (hasDn) {
         // Recv from network, broadcast
         Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+          prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff,
+             args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
-          prims.recvCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
+          prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
         }
       } else {
         // Recv from network (no post thread needed)
         Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
+          prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
+             args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
@@ -705,23 +708,27 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
     const ssize_t size = args->count;
     const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
     const int nranks = ncclShmem.comm.nRanks;
-    const int reduceWarps = nranks <= 6 ? 6 : 4;
-    const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
+    const bool hasOut = nvls->out != -1;
+    const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5;
+    const int bcastWarps = hasOut ? 2 : 0;
+    const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
+    const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
 
-    const int nThreadsScatter = copyWarps*WARP_SIZE;
-    const int nThreadsGather  = (copyWarps-1)*WARP_SIZE;
-    const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE;
+    const int nThreadsScatter = scatterWarps*WARP_SIZE;
+    const int nThreadsGather  = gatherWarps*WARP_SIZE;
+    const int nThreadsReduce = reduceWarps*WARP_SIZE;
+    const int nThreadsBcast  = (bcastWarps)*WARP_SIZE;
     const int tidEndScatter = nThreadsScatter;
     const int tidEndGather = tidEndScatter + nThreadsGather;
     const int tidEndReduce = tidEndGather + nThreadsReduce;
-
-    using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>;
+    const int tidEndBcast = tidEndReduce + nThreadsBcast;
 
     if (tid < tidEndScatter) {
       // Scatter
-      int group = (0*Proto::MaxGroupWidth) | (0<<16);
+      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
       Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
+           args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
         int nelem = min(nvls->nHeads*chunkSize, size-offset);
@@ -729,19 +736,136 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
       }
     } else if (tid < tidEndGather) {
       // Gather
-      int group = (2*Proto::MaxGroupWidth) | (0<<16);
+      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+           args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
         int nelem = min(nvls->nHeads*chunkSize, size-offset);
         prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
       }
-    } else if (tid < tidEndReduce) {
-      int group = (3*Proto::MaxGroupWidth) | (1<<16);
-      // Reduce, broadcast through NVLS
+    } else if (tid < tidEndReduce && nvls->headRank != -1) {
+      if (!hasOut) {
+        // Reduce, broadcast through NVLS
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
+             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          prims.recvSend(nelem);
+        }
+      } else {
+        // Reduce, send to network
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
+             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          prims.recvSend(nelem);
+        }
+      }
+    } else if (tid < tidEndBcast && nvls->headRank != -1) {
+      // Recv from network, broadcast
+      using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
       Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
+        prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
+           args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        prims.recvSend(nelem);
+      }
+    }
+  #endif // NCCL_NVLS_ENABLED
+  }
+};
+
+template<typename T, typename RedOp>
+struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
+  #if NCCL_NVLS_ENABLED
+    const int tid = threadIdx.x;
+    const int bid = args->bid;
+    const int nChannels = args->nChannels;
+    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
+    const int treeUp = nvls->treeUp;
+    const int* treeDown = nvls->treeDown;
+    const ssize_t chunkSize = int(args->lastChunkSize);
+    const ssize_t size = args->count;
+    const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
+    const int nranks = ncclShmem.comm.nRanks;
+    const bool hasUp = treeUp != -1;
+    const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5;
+    const int bcastWarps = hasUp ? 4 : 0;
+    const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
+    const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
+
+    const int nThreadsScatter = scatterWarps*WARP_SIZE;
+    const int nThreadsGather  = gatherWarps*WARP_SIZE;
+    const int nThreadsReduce = reduceWarps*WARP_SIZE;
+    const int nThreadsBcast  = (bcastWarps)*WARP_SIZE;
+    const int tidEndScatter = nThreadsScatter;
+    const int tidEndGather = tidEndScatter + nThreadsGather;
+    const int tidEndReduce = tidEndGather + nThreadsReduce;
+    const int tidEndBcast = tidEndReduce + nThreadsBcast;
+
+    if (tid < tidEndScatter) {
+      // Scatter
+      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
+           args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
+        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
+      }
+    } else if (tid < tidEndGather) {
+      // Gather
+      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
+        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+           args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
+        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
+      }
+    } else if (tid < tidEndReduce && nvls->headRank != -1) {
+      if (!hasUp) {
+        // No treeUp peer: reduce what is received from treeDown and send the result back to treeDown
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
+        Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
+             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          prims.recvSend(nelem);
+        }
+      } else {
+        // Reduce what is received from treeDown, send the result to treeUp
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/0, Proto, 0>
+          prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
+              args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
+          int nelem = min(chunkSize, size-offset);
+          prims.recvSend(nelem);
+        }
+      }
+    } else if (tid < tidEndBcast && nvls->headRank != -1) {
+      // Recv from treeUp, broadcast to treeDown (these warps only exist when hasUp, since bcastWarps is 0 otherwise)
+      using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+      Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/0, Proto, 0>
+        prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
+           args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
         int nelem = min(chunkSize, size-offset);
@@ -762,21 +886,26 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
     ncclTree *tree = &ncclShmem.channel.collnetChain;
     ssize_t chunkSize = int(args->lastChunkSize);
     const ssize_t loopSize = int(nChannels*chunkSize);
+    const int nranks = ncclShmem.comm.nRanks;
     const ssize_t size = args->count;
 
     int nthreadsSplit = nthreads/2;
     if (nthreadsSplit >= 256) nthreadsSplit += 64;
 
-    int group, send, recv, groupTid, groupNthreads;
+    int group, connIndex, send, recv, groupTid, groupNthreads;
     using Proto = ProtoSimple<1, 1>;
     if (tid < nthreadsSplit) {
-      group = (0*Proto::MaxGroupWidth) | (1<<16);
+      // Reduce up the chain
+      group = 0;
+      connIndex = 1;
       recv = tree->down[0];
       send = tree->up;
       groupTid = tid;
       groupNthreads = nthreadsSplit;
     } else {
-      group = (1*Proto::MaxGroupWidth);
+      // Broadcast down the chain
+      group = 1;
+      connIndex = 0;
       recv = tree->up;
       send = tree->down[0];
       groupTid = tid - nthreadsSplit;
@@ -784,7 +913,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
     }
 
     Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-      prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group);
+      prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
+          args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex);
 
     if (tid < nthreadsSplit) {
       if (recv == -1) {
@@ -802,17 +932,34 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
       }
     }
     else {
-      if (send == -1) {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
-          prims.directRecv(offset, nelem);
+      if (recv == nranks) {
+        // I'm the first in the broadcast chain, I need to perform the division (postOp)
+        if (send == -1) {
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            ssize_t offset = gridOffset + bid*int(chunkSize);
+            int nelem = min(chunkSize, size-offset);
+            prims.recv(offset, nelem, /*postOp*/true);
+          }
+        } else {
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            ssize_t offset = gridOffset + bid*int(chunkSize);
+            int nelem = min(chunkSize, size-offset);
+            prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
+          }
         }
       } else {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
-          prims.directRecvCopySend(offset, offset, nelem);
+        if (send == -1) {
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            ssize_t offset = gridOffset + bid*int(chunkSize);
+            int nelem = min(chunkSize, size-offset);
+            prims.directRecv(offset, nelem);
+          }
+        } else {
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            ssize_t offset = gridOffset + bid*int(chunkSize);
+            int nelem = min(chunkSize, size-offset);
+            prims.directRecvCopySend(offset, nelem);
+          }
         }
       }
     }
diff --git a/src/collectives/device/alltoall_pivot.h b/src/collectives/device/alltoall_pivot.h
index 443a369cee..c7912808e6 100644
--- a/src/collectives/device/alltoall_pivot.h
+++ b/src/collectives/device/alltoall_pivot.h
@@ -51,7 +51,7 @@ namespace {
       if (num_hops == 0 && args->sendbuff != args->recvbuff) {
         const T* sendbuff = (const T*)args->sendbuff + send_offset;
         T* recvbuff = (T *)args->recvbuff + recv_offset;
-        ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(
+        reduceCopy<COLL_UNROLL, RedOp, T, /*MultimemSrcs,MinSrcs,MaxSrcs=*/0, 1, 1, /*MultimemDsts,MinDsts,MaxDsts=*/0, 1, 1, /*PreOpSrcs=*/0>(
             tid, nthreads, 0, nullptr, false, 1, (void **)&sendbuff, 1, (void **)&recvbuff, send_recv_size);
       } else {
         for (ssize_t prims_offset = 0; prims_offset < send_recv_size; prims_offset += prims_size) {
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index db3aed51e8..5dc72b5a4c 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -50,7 +50,7 @@ namespace {
     T *inputBuf = (T*)args->sendbuff;
     T *outputBuf = (T*)args->recvbuff;
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16);
+      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex);
 
 #if defined(ENABLE_NPKIT)
     if (tid == 0) {
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index e8ec8a57c5..29a951ab11 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -42,7 +42,8 @@
   NCCL_FUNC5(func, RING,    devredop, type, nullify), \
   NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
   NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
-  NCCL_FUNC5(func, NVLS, devredop, type, nullify)
+  NCCL_FUNC5(func, NVLS, devredop, type, nullify), \
+  NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify)
 
 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(func, devredop, nullForFloat) \
@@ -119,8 +120,8 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
 #endif
 };
 
-static_assert(FUNC_INDEX_P2P == 4510, "Wrong P2P function index");
-static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 4511, "Wrong AllToAllPivot function index");
+static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index");
+static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index");
 
 #ifndef USE_INDIRECT_FUNCTION_CALL
 template<unsigned short f, unsigned short l, bool u>
@@ -180,46 +181,46 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
   else
     assert("Unsupported function index");
 #else
-  if (funcIndex < 900) {
-    if (funcIndex % 15 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
-    else if (funcIndex % 15 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
-    else if (funcIndex % 15 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
-    else if (funcIndex % 15 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
-    else if (funcIndex % 15 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
-    else if (funcIndex % 15 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
-    else if (funcIndex % 15 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
+  if (funcIndex < 1080) {
+    if (funcIndex % 18 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
+    else if (funcIndex % 18 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 18 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
+    else if (funcIndex % 18 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 18 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
+    else if (funcIndex % 18 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 18 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
     else ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
   }
-  else if (funcIndex < 1800) Caller<900, 1800, USING_LL128>::call(funcIndex);
-  else if (funcIndex < 2700) {
-    if (funcIndex % 15 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
-    else if (funcIndex % 15 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
-    else if (funcIndex % 15 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
-    else if (funcIndex % 15 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
-    else if (funcIndex % 15 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
-    else if (funcIndex % 15 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
-    else if (funcIndex % 15 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
-    else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
-    else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
+  else if (funcIndex < 2160) Caller<1080, 2160, USING_LL128>::call(funcIndex);
+  else if (funcIndex < 3240) {
+    if (funcIndex % 18 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
+    else if (funcIndex % 18 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 18 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
+    else if (funcIndex % 18 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 18 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
+    else if (funcIndex % 18 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
+    else if (funcIndex % 18 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
+    else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
+    else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
     else ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
   }
-  else if (funcIndex < 4500) Caller<2700, 4500, USING_LL128>::call(funcIndex);
+  else if (funcIndex < 5400) Caller<3240, 5400, USING_LL128>::call(funcIndex);
   else {
-    switch (funcIndex - 4500) {
+    switch (funcIndex - 5400) {
       case 0:
         ncclFunction_OneRankReduce_PreMulSum_int8_t();
         break;
@@ -353,7 +354,6 @@ struct ncclShmemGroup {
   ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
   void* srcs[NCCL_MAX_NVLS_ARITY+1];
   void* dsts[NCCL_MAX_NVLS_ARITY+1];
-  int nvlsRecv;
   uint64_t barrier;
   uint64_t barrier_next[NCCL_MAX_GROUPS];
 };
@@ -621,7 +621,8 @@ __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
   IMPL_COLL4(func, RING,    devredop, type) \
   IMPL_COLL4(func, COLLNET_DIRECT, devredop, type) \
   IMPL_COLL4(func, COLLNET_CHAIN, devredop, type) \
-  IMPL_COLL4(func, NVLS, devredop, type)
+  IMPL_COLL4(func, NVLS, devredop, type) \
+  IMPL_COLL4(func, NVLS_TREE, devredop, type)
 
 #define IMPL_COLL2(func, devredop) \
   IMPL_COLL3(func, devredop, int8_t) \
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index cd1aa5b6c8..f204aee88e 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -28,7 +28,8 @@ inline __device__ int loadInt(int* ptr) {
 }
 
 template<typename RedFn, typename T, int Unroll, int BytePerPack,
-         int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int PreOpSrcs,
+         int MultimemSrcs, int MinSrcs, int MaxSrcs,
+         int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
          typename IntBytes>
 __device__ __forceinline__ void reduceCopyPacks(
     int nThreads, int &thread,
@@ -37,6 +38,7 @@ __device__ __forceinline__ void reduceCopyPacks(
     IntBytes &nBytesBehind, IntBytes &nBytesAhead
   ) {
   static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
+  //if (BytePerPack == 0) __trap();
 
   // A hunk is the amount of contiguous data a warp consumes per loop iteration
   // assuming all threads partake.
@@ -49,15 +51,15 @@ __device__ __forceinline__ void reduceCopyPacks(
   IntBytes threadBytesBehind = nBytesBehind + (warp*BytePerHunk + lane*BytePerPack);
   IntBytes threadBytesAhead = nBytesAhead - (warp*BytePerHunk + lane*BytePerPack);
   // Number of hunks to be consumed over all warps.
-  IntBytes nHunksAhead = nBytesAhead/BytePerHunk;
+  IntBytes nHunksAhead = nBytesAhead/(BytePerHunk + !BytePerHunk);
   // Advance collective position.
   nBytesBehind += nHunksAhead*BytePerHunk;
   nBytesAhead -= nHunksAhead*BytePerHunk;
   if (Unroll==1 && BytePerPack <= nBytesAhead) {
     // Only Unroll=1 can do partial hunks (where not all threads partake).
     nHunksAhead += 1;
-    nBytesBehind += nBytesAhead - (nBytesAhead%BytePerPack);
-    nBytesAhead = nBytesAhead%BytePerPack;
+    nBytesBehind += nBytesAhead - (nBytesAhead%(BytePerPack + !BytePerPack));
+    nBytesAhead = nBytesAhead%(BytePerPack + !BytePerPack);
   }
   nHunksAhead -= warp;
 
@@ -79,8 +81,13 @@ __device__ __forceinline__ void reduceCopyPacks(
     { RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0);
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
-        // Use volatile loads in case credits are polled for with volatile (instead of acquire).
-        acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
+        if (0 < MultimemSrcs) {
+          // applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
+          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[0]);
+        } else {
+          // Use volatile loads in case credits are polled for with volatile (instead of acquire).
+          acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
+        }
         minSrcs[0] += WARP_SIZE*BytePerPack;
         if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
       }
@@ -92,8 +99,13 @@ __device__ __forceinline__ void reduceCopyPacks(
       RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
-        // Use volatile loads in case credits are polled for with volatile (instead of acquire).
-        tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
+        if (s < MultimemSrcs) {
+          // Load into tmp (not acc, which already holds the earlier sources). applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
+          tmp[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[s]);
+        } else {
+          // Use volatile loads in case credits are polled for with volatile (instead of acquire).
+          tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
+        }
         minSrcs[s] += WARP_SIZE*BytePerPack;
       }
       #pragma unroll Unroll
@@ -130,7 +142,11 @@ __device__ __forceinline__ void reduceCopyPacks(
     for (int d=0; d < MinDsts; d++) {
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
-        st_global<BytePerPack>(minDsts[d], acc[u]);
+        if (d < MultimemDsts) {
+          multimem_st_global(minDsts[d], acc[u]);
+        } else {
+          st_global<BytePerPack>(minDsts[d], acc[u]);
+        }
         minDsts[d] += WARP_SIZE*BytePerPack;
       }
     }
@@ -167,215 +183,61 @@ __device__ __forceinline__ void reduceCopyPacks(
 }
 
 template<int Unroll, typename RedFn, typename T,
-         int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int PreOpSrcs,
+         int MultimemSrcs, int MinSrcs, int MaxSrcs,
+         int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
          typename IntBytes>
-__device__ __forceinline__ void ReduceOrCopyMulti(
+__device__ __forceinline__ void reduceCopy(
     int thread, int nThreads,
     uint64_t redArg, uint64_t *preOpArgs, bool postOp,
     int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
     IntBytes nElts
   ) {
+  static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values.");
   //int nWarps = nThreads/WARP_SIZE;
   //int warp = thread/WARP_SIZE;
   int lane = thread%WARP_SIZE;
-
-  // Check that all is 16B aligned. If not don't use 16B load/stores.
-  int aligned = 1;
-  if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane])%4;
-  if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane])%4;
-  aligned = !(__any(!aligned));
+  // If a multimem src is present then our biggest pack size is limited to what
+  // is supported for this redfn/type.
+  constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize<RedFn>::BigPackSize;
 
   IntBytes nBytesBehind = 0;
   IntBytes nBytesAhead = nElts*sizeof(T);
-  if (aligned) {
-    reduceCopyPacks<RedFn, T, Unroll*((MinSrcs == 1 && MinDsts == 1) ? 2 : 1), /*BytePerPack=*/16,
-      MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
-      (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
-       nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
-    if (nBytesAhead == 0) return;
 
-    reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/16,
-      MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
-      (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
-       nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
-    if (nBytesAhead == 0) return;
+  #if __cpp_if_constexpr
+  if constexpr (BigPackSize > sizeof(T)) {
+  #else
+  if (BigPackSize > sizeof(T)) {
+  #endif
+    // Check that all pointers are BigPackSize aligned.
+    bool aligned = true;
+    if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize);
+    if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize);
+    aligned = !(__any(!aligned));
+    if (aligned) {
+      reduceCopyPacks<RedFn, T, Unroll*((MinSrcs == 1 && MinDsts == 1) ? 2 : 1), BigPackSize,
+        MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
+        (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
+         nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
+      if (nBytesAhead == 0) return;
+
+      reduceCopyPacks<RedFn, T, /*Unroll=*/1, BigPackSize,
+        MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
+        (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
+         nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
+      if (nBytesAhead == 0) return;
+    }
   }
 
   reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
-    MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
+    MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
     (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
      nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
   if (nBytesAhead == 0) return;
 
   reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/sizeof(T),
-    MinSrcs, MaxSrcs, MinDsts, MaxDsts, PreOpSrcs>
+    MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
     (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
      nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
 }
 
-// Copies from srcAddr to dstAddr using multimem load/store. The amount copied
-// will be at most Unroll*BytePerPack*WARP_SIZE. If Partial=1, then the amount
-// will be the min() of that and nBytesAhead. If srcAddr is not BytePerPack
-// aligned then the amount copied will be less by (srcAddr%BytePerPack) since
-// we begin loads at the first pack containing the first element.
-template<typename RedFn, typename T, int Unroll, int BytePerPack,
-         bool SrcAligned, // is srcAddr aligned to BytePerPack
-         bool DstAligned, // are dstAddr and nBytesAhead both aligned to BytePerPack
-         bool Partial, // is this a possibly partial hunk
-         typename IntBytes>
-__device__ __forceinline__ void copyMultimemMultimem_WarpUnrolled(
-    int lane, RedFn redFn, bool postOp, uintptr_t srcAddr, uintptr_t dstAddr,
-    IntBytes nBytesAhead, uint32_t scratchAddr
-  ) {
-#if 0
-  int srcMisalign = SrcAligned ? 0 : srcAddr%BytePerPack;
-  srcAddr -= srcMisalign;
-
-  BytePack<BytePerPack> reg[Unroll];
-  int offset = lane*BytePerPack;
-  #pragma unroll Unroll
-  for (int u=0; u < Unroll; u++) {
-    if (!Partial || (offset < srcMisalign + nBytesAhead)) {
-      reg[u] = applyLoadMultimem(redFn, srcAddr+offset);
-      if (postOp) reg[u] = applyPostOp(redFn, reg[u]);
-    }
-    offset += WARP_SIZE*BytePerPack;
-  }
-
-  if (SrcAligned && DstAligned) {
-    offset = lane*BytePerPack;
-    #pragma unroll Unroll
-    for (int u=0; u < Unroll; u++) {
-      if (!Partial || offset < nBytesAhead) {
-        multimem_st_global<BytePerPack>(dstAddr+offset, reg[u]);
-      }
-      offset += WARP_SIZE*BytePerPack;
-    }
-  } else {
-    __syncwarp();
-    offset = lane*BytePerPack;
-    #pragma unroll Unroll
-    for (int u=0; u < Unroll; u++) {
-      if (!Partial || (offset < srcMisalign + nBytesAhead)) {
-        st_shared<BytePerPack>(scratchAddr+offset, reg[u]);
-      }
-      offset += WARP_SIZE*BytePerPack;
-    }
-    __syncwarp();
-    if (!SrcAligned) {
-      // Ignore the beginning of the first pack corresponding to bytes overread
-      // due to misalignment.
-      nBytesAhead = min(nBytesAhead, Unroll*WARP_SIZE*BytePerPack - srcMisalign);
-    }
-    copyGlobalShared_WarpUnrolled
-      <sizeof(T), /*MaxBytes=*/Unroll*WARP_SIZE*BytePerPack, /*Multimem=*/1>
-        (lane, dstAddr, scratchAddr+srcMisalign, nBytesAhead);
-  }
-#endif
-}
-
-// copyMultimemMultimem_IfEnabled has two overloads: the enabled case whose first arg
-// has type `std::true_type` and the disabled case with first arg `std::false_type`.
-// This is to guard the template instantiations of Apply_LoadMultimem on types/ops where
-// they aren't supported. A nicer approach is to use C++17's "if constexpr".
-template<typename RedFn, typename IntBytes>
-__device__ __forceinline__ void copyMultimemMultimem_IfEnabled(
-    std::false_type enabled/*=false*/,
-    int thread, int nThreads, uint64_t redArg, bool postOp,
-    void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
-  ) {
-  // nop
-}
-
-template<typename RedFn, typename IntBytes>
-__device__ __forceinline__ void copyMultimemMultimem_IfEnabled(
-    std::true_type enabled/*=true*/,
-    int thread, int nThreads, uint64_t redArg, bool postOp,
-    void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
-  ) {
-  static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
-
-  constexpr int BytePerPack = Apply_LoadMultimem<RedFn>::PackSize;
-  using T = typename RedFn::EltType;
-  constexpr int Unroll = ncclNvlsUnroll(BytePerPack);
-  constexpr int BytePerHunk = Unroll*WARP_SIZE*BytePerPack;
-  int nWarps = nThreads/WARP_SIZE;
-  int warp = thread/WARP_SIZE;
-  int lane = thread%WARP_SIZE;
-  RedFn redFn(redArg);
-
-  uintptr_t srcAddr = cvta_to_global(srcPtr);
-  uintptr_t dstAddr = cvta_to_global(dstPtr);
-  IntBytes warpBytesAhead = nElts*sizeof(T);
-  bool partialHunkIsFront;
-
-  // First handle misalignment of srcAddr.
-  if ((BytePerPack != sizeof(T)) && (srcAddr%BytePerPack != 0)) {
-    // If srcAddr isn't pack aligned then the first hunk processed will be short
-    // the same number of bytes as srcAddr's misalignment.
-    if (warp == 0) {
-      partialHunkIsFront = true;
-      goto PartialHunk; // "call" PartialHunk()
-    PartialHunkFrontReturn:
-      warp = nWarps;
-    }
-    warp -= 1; // Rotate warp numbers for load balancing
-    int advanced = BytePerHunk-(srcAddr%BytePerPack); // since copyMultimemMultimem_WarpUnrolled shorts by the misalignment
-    srcAddr += advanced; // srcAddr is now pack aligned
-    dstAddr += advanced;
-    warpBytesAhead -= advanced;
-  }
-
-  warpBytesAhead -= warp*BytePerHunk;
-  srcAddr += warp*BytePerHunk;
-  dstAddr += warp*BytePerHunk;
-  // Now that srcAddr is pack aligned detect if dstAddr is pack aligned.
-  if ((BytePerPack == sizeof(T)) || (dstAddr%BytePerPack == 0)) {
-    while (BytePerHunk <= warpBytesAhead) {
-      copyMultimemMultimem_WarpUnrolled
-        <RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/true, /*DstAligned=*/true, /*Partial=*/false>
-          (lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
-      srcAddr += nWarps*BytePerHunk;
-      dstAddr += nWarps*BytePerHunk;
-      warpBytesAhead -= nWarps*BytePerHunk;
-    }
-  } else {
-    while (BytePerHunk <= warpBytesAhead) {
-      copyMultimemMultimem_WarpUnrolled
-        <RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/true, /*DstAligned=*/false, /*Partial=*/false>
-          (lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
-      srcAddr += nWarps*BytePerHunk;
-      dstAddr += nWarps*BytePerHunk;
-      warpBytesAhead -= nWarps*BytePerHunk;
-    }
-  }
-
-  if (0 < warpBytesAhead) {
-    partialHunkIsFront = false;
-    goto PartialHunk; // "call" PartialHunk()
-  PartialHunkBackReturn:;
-  }
-  return;
-
-PartialHunk:
-  // We have to handle a partial hunk possibly at the front and back of the
-  // buffer. We generate the code once here since its a lot of instructions,
-  // and then simulate function calls with gotos.
-  copyMultimemMultimem_WarpUnrolled
-    <RedFn, T, Unroll, BytePerPack, /*SrcAligned=*/false, /*DstAligned=*/false, /*Partial=*/true>
-      (lane, redFn, postOp, srcAddr, dstAddr, warpBytesAhead, warpScratchAddr);
-  if (partialHunkIsFront) goto PartialHunkFrontReturn;
-  goto PartialHunkBackReturn;
-}
-
-template<typename RedFn, typename IntBytes>
-__device__ __forceinline__ void copyMultimemMultimem(
-    int thread, int nThreads, uint64_t redArg, bool postOp,
-    void *srcPtr, void *dstPtr, IntBytes nElts, uint32_t warpScratchAddr
-  ) {
-  constexpr bool Enabled = Apply_LoadMultimem<RedFn>::PackSize != 0;
-  copyMultimemMultimem_IfEnabled<RedFn>(
-    /*enabled=*/std::integral_constant<bool, Enabled>(),
-    thread, nThreads, redArg, postOp, srcPtr, dstPtr, nElts, warpScratchAddr);
-}
 #endif // COMMON_KERNEL_H_
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index 018e511c0b..85695f059e 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -26,7 +26,8 @@ __shared__ ncclShmemData ncclShmem;
   NCCL_FUNC5(func, RING,    devredop, type, nullify), \
   NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
   NCCL_FUNC5(func, COLLNET_CHAIN,  devredop, type, nullify), \
-  NCCL_FUNC5(func, NVLS,           devredop, type, nullify)
+  NCCL_FUNC5(func, NVLS,           devredop, type, nullify), \
+  NCCL_FUNC5(func, NVLS_TREE,      devredop, type, nullify)
 
 #if defined(__CUDA_BF16_TYPES_EXIST__)
 // Must be consistent with ncclDataType_t
diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu
index 9ebe8eea84..0c40f9afce 100644
--- a/src/collectives/device/onerank_reduce.cu
+++ b/src/collectives/device/onerank_reduce.cu
@@ -42,7 +42,7 @@ namespace {
       dst += i0;
       void *vsrc = (void*)src;
       void *vdst = (void*)dst;
-      ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/1>
+      reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
         (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
     }
   }
diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h
index 8ee13fbb6f..5b694545c5 100644
--- a/src/collectives/device/op128.h
+++ b/src/collectives/device/op128.h
@@ -7,6 +7,8 @@
 #ifndef OP128_H_
 #define OP128_H_
 
+#include <type_traits>
+
 inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
   v0 = __builtin_nontemporal_load(ptr);
   v1 = __builtin_nontemporal_load(ptr+1);
@@ -88,6 +90,8 @@ __device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
 template<int Size>
 union BytePack;
 template<>
+union BytePack<0> {};
+template<>
 union BytePack<1> {
   uint8_t u8, native;
 };
@@ -130,14 +134,26 @@ union alignas(16) BytePack<16> {
 };
 
 template<typename T>
-__device__ __forceinline__ BytePack<sizeof(T)> toPack(T value)  {
-  union { BytePack<sizeof(T)> p; T v; };
+struct BytePackOf { // Trait: maps a type T to the BytePack<> whose template argument is sizeof(T).
+  static constexpr int Size = sizeof(T);
+  using Pack = BytePack<Size>;
+};
+template<>
+struct BytePackOf<BytePack<0>> { // Special case: for the empty pack, Size is pinned to 0 instead of sizeof(BytePack<0>).
+  static constexpr int Size = 0;
+  using Pack = BytePack<0>;
+};
+
+template<typename T>
+__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value)  {
+  union { typename BytePackOf<T>::Pack p; T v; };
   v = value;
   return p;
 }
+
 template<typename T>
-__device__ __forceinline__ T fromPack(BytePack<sizeof(T)> pack)  {
-  union { BytePack<sizeof(T)> p; T v; };
+__device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack)  {
+  union { typename BytePackOf<T>::Pack p; T v; };
   p = pack;
   return v;
 }
@@ -152,6 +168,13 @@ template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
 template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
 //template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
 
+template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
+template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
+//template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
+//template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
+template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
+//template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
+
 // Used to define implementations for above prototypes.
 #define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
   template<> \
@@ -255,6 +278,18 @@ __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size
 
 #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
 template<>
+__device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) {
+  // Nothing to store: BytePack<0> is declared as an empty union, so this specialization is a no-op.
+}
+template<>
+__device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) {
+  asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory"); // emits a plain st.global (not multimem.st); the byte is widened to uint32_t for the "r" operand
+}
+template<>
+__device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) {
+  asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory"); // emits a plain st.global (not multimem.st); val.u16 is passed through the "h" constraint
+}
+template<>
 __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
   asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
 }
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 024df18c64..170a9cad43 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -37,13 +37,14 @@
  * to how that protocol operates with a consistent interface so that our
  * algorithm code can operate protocol parametrically.
  */
-template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, bool NVLS_1 = false>
+template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, int MultimemSrcs_1 = 0, int MultimemDsts_1 = 0>
 struct ProtoSimple {
   static constexpr int Id = NCCL_PROTO_SIMPLE;
   static constexpr int SlicePerChunk = SlicePerChunk_1;
   static constexpr int StepPerSlice = StepPerSlice_1;
   static constexpr int Unroll = Unroll_1;
-  static constexpr bool NVLS = NVLS_1;
+  static constexpr int MultimemSrcs = MultimemSrcs_1;
+  static constexpr int MultimemDsts = MultimemDsts_1;
 
   // Data bytes (no flags etc) in one step of the fifo queue.
   __device__ static int calcBytePerStep() {
@@ -55,9 +56,6 @@ struct ProtoSimple {
   }
   // Group width is how many consecutive group values a subchannel occupies.
   static constexpr int MaxGroupWidth = 1;
-  __device__ static int calcGroupWidth(bool send, int nthreads) {
-    return 1;
-  }
 };
 
 struct ProtoLL {
@@ -73,9 +71,6 @@ struct ProtoLL {
   }
   // Group width is how many consecutive group values a subchannel occupies.
   static constexpr int MaxGroupWidth = 1;
-  __device__ static int calcGroupWidth(bool send, int nthreads) {
-    return 1;
-  }
 };
 
 struct ProtoLL128 {
@@ -91,9 +86,6 @@ struct ProtoLL128 {
   }
   // Group width is how many consecutive group values a subchannel occupies.
   static constexpr int MaxGroupWidth = 1;
-  __device__ static int calcGroupWidth(bool send, int nthreads) {
-    return 1;
-  }
 };
 
 /* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template
@@ -133,22 +125,22 @@ class Primitives;
 // Used by LL & LL128 to implement direct members in the naive way.
 template<typename RealPrimitives>
 struct PrimitivesWithoutDirect {
-  __device__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) {
+  __device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) { // outIx is not used here: the call forwards to the non-direct send()
     static_cast<RealPrimitives*>(this)->send(inpIx, eltN);
   }
-  __device__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
+  __device__ void directSendFromOutput(intptr_t outIx, int eltN) {
     static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
   }
   __device__ void directRecv(intptr_t outIx, int eltN) {
     static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
   }
-  __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
+  __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
   }
-  __device__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
+  __device__ void directRecvCopySend(intptr_t outIx, int eltN) {
     static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
   }
-  __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
+  __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     // Direct is only for the send part
     static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
   }
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index a0060dae46..faa2b03770 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -566,24 +566,24 @@ private:
  public:
   __device__  Primitives(
       const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
-      void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0
+      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0
     ):
     redOp(redOpArg),
-    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF),
+    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
     stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
-    int connIndex = group >> 16;
     auto *channel = &ncclShmem.channel;
-    barriers = &ncclShmem.groups[this->group].barrier;
-    barrier_next = ncclShmem.groups[this->group].barrier_next;
+    barriers = &ncclShmem.groups[group].barrier;
+    barrier_next = ncclShmem.groups[group].barrier_next;
     // If we are going to support oneshot collNet + LL, then we would need to add connector index here
     int nrecv=0, nsend=0;
     // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
     while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
-      loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
+      loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
       nrecv++;
     }
     while (nsend < MaxSend && sendPeers[nsend] >= 0) {
-      loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
+      loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
       nsend++;
     }
     this->fan = Fan(nrecv, nsend);
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 0529464f36..469b843890 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -32,6 +32,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
   const int wid;
   const int stepSize;
   const int warp;
+  const int warpInBlock; // warp index in thread block
   const bool flagThread;
   const int group;
   Fan fan;
@@ -488,23 +489,24 @@ private:
 public:
   __device__ Primitives(
       const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
-      void const *inputBuf, void *outputBuf, uint64_t redOpArg, int group=0
+      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0
     ):
     redOp(redOpArg),
     tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
-    flagThread((tid%4)==3), group(group&(uint16_t)0xFFFF),
+    warpInBlock(threadIdx.x/WARP_SIZE),
+    flagThread((tid%4)==3), group(group),
     stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
-    int connIndex = group >> 16;
     auto *channel = &ncclShmem.channel;
-    barriers = &ncclShmem.groups[this->group].barrier;
-    barrier_next = ncclShmem.groups[this->group].barrier_next;
+    barriers = &ncclShmem.groups[group].barrier;
+    barrier_next = ncclShmem.groups[group].barrier_next;
     int nrecv=0, nsend=0;
     while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
-      loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[connIndex], nrecv);
+      loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
       nrecv++;
     }
     while (nsend < MaxSend && sendPeers[nsend] >= 0) {
-      loadSendConn(&channel->peers[sendPeers[nsend]].send[connIndex], nsend);
+      loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
       nsend++;
     }
     this->fan = Fan(nrecv, nsend);
diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h
index fb5b0e0af9..27c02bf0bd 100644
--- a/src/collectives/device/prims_simple.h
+++ b/src/collectives/device/prims_simple.h
@@ -13,9 +13,9 @@
 #include "msccl/msccl_struct.h"
 
 template<typename T, typename RedOp, typename Fan, int Direct,
-         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, bool NVLS>
+         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
 class Primitives<
-    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, NVLS>, P2p
+    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
   > {
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
@@ -31,10 +31,9 @@ class Primitives<
                        DirectWrite = 0x200,
                        DirectRead = 0x400,
                        ThreadsSynced = 0x800,
-                       NvlsMinPolling = 0x1000,
-                       NvlsRecv = 0x2000;
+                       NvlsMinPolling = 0x1000;
   const int tid, tidInBlock;
-  int nthreads;
+  const int nthreads;
   int nworkers;
   const int stepSize;
   Fan fan;
@@ -93,19 +92,19 @@ private:
 
   inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
     #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
-    if (NVLS && (flags & NvlsMinPolling)) {
+    if (flags & NvlsMinPolling) {
       uint64_t ans;
       asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
       return ans;
     }
     #endif
-    // volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti
+    // volatile is faster than acquire but not as correct. Make sure reduceCopy
     // loads data using volatile so it doesn't see stale data in L1.
     return atomicAdd((unsigned long long *)ptr, 0);
   }
 
   template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
-  __device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) {
+  __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
     const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
     const bool noRecvWait = DirectRecv && Src && (flags & DirectRead);        // no wait when directly reading from remote input
     const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
@@ -132,7 +131,7 @@ private:
         ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
       else if (isSendNotRecv && DirectSend) {
         if (flags & DirectWrite) {
-          ptrs[index] = directBuff + remoteIx + offset;
+          ptrs[index] = directBuff + dstIx + offset;
         } else if (flags & DirectRead) {  // empty send
           ptrs[index] = nullptr;
         } else {
@@ -140,7 +139,7 @@ private:
         }
       } else if (!isSendNotRecv && DirectRecv) {
         if (flags & DirectRead) {
-          ptrs[index] = directBuff + remoteIx + offset;
+          ptrs[index] = directBuff + srcIx + offset;
         } else if (flags & DirectWrite) {
           ptrs[index] = directBuff + dstIx + offset;  // send to next from my output buffer
         } else {
@@ -173,7 +172,7 @@ private:
 
   template <int DirectRecv1, int DirectSend1, int Recv, int Send, int SrcBuf, int DstBuf>
   __device__ __forceinline__ void genericOp(
-      intptr_t srcIx, intptr_t dstIx, intptr_t remoteIx, int nelem, bool postOp
+      intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp
     ) {
     constexpr int DirectRecv = 1 && Direct && DirectRecv1;
     constexpr int DirectSend = 1 && Direct && DirectSend1;
@@ -217,17 +216,12 @@ private:
           ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
         if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
           ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
-        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
+        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
         subBarrier();
         /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
          * to 0 to avoid unnecessary workload. */
         int workSize = ncclShmem.aborted ? 0 : sliceSize;
-        if (NVLS && ncclShmem.groups[group].nvlsRecv) {
-          void* src = ncclShmem.groups[group].srcs[0];
-          void* dst = ncclShmem.groups[group].dsts[0];
-          copyMultimemMultimem<RedOp>(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize,
-          cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE)));
-        } else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
+        if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
           // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
           if (Send) {
 
@@ -244,7 +238,7 @@ private:
             }
 #endif
 
-            ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, MaxSend, /*PreOpSrcs*/0>
+            reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
               (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
                1, ncclShmem.groups[group].srcs,
                fan.nsend(), ncclShmem.groups[group].dsts+1,
@@ -280,7 +274,7 @@ private:
           }
 #endif
 
-          ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs*/0>
+          reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs*/0>
             (tid, nworkers, ncclShmem.redOpArgs[0],  nullptr, postOp,
              Recv, ncclShmem.groups[group].srcs,
              Dst, ncclShmem.groups[group].dsts,
@@ -316,7 +310,9 @@ private:
 
           constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
                                     DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
-          ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
+          reduceCopy<Unroll, RedOp, T,
+            MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
+            MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
             (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
              Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
              Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
@@ -370,19 +366,19 @@ private:
         srcs[nsrcs] = dsts[0];
         nsrcs++;
         if (MULTISRCS){
-          ReduceOrCopyMulti<Unroll, RedOp, T, 3, MSCCL_MAX_REDUCE_FUSION, 1, 1, 0>
+          reduceCopy<Unroll, RedOp, T, 0, 3, MSCCL_MAX_REDUCE_FUSION, 0, 1, 1, 0>
             (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, nsrcs, (void **)srcs, 1, (void **)dsts, nelem);
         } else {
-          ReduceOrCopyMulti<Unroll, RedOp, T, 2, 2, 1, 1, 0>
+          reduceCopy<Unroll, RedOp, T, 0, 2, 2, 0, 1, 1, 0>
             (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 2, (void **)srcs, 1, (void **)dsts, nelem);
         }
       }
       if (COPY){
-        ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
+        reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
           (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)srcs, 1, (void **)dsts, nelem);
         if (MULTISRCS) {
           for (int i = 1; i < nsrcs; i++){
-            ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
+            reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
               (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)&srcs[i], 1, (void **)&dsts[i], nelem);
           }
         }
@@ -425,7 +421,7 @@ private:
             void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
             int realPeerSize = min(realSize, totalElem-pOffset);
             if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
-              ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
+              reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
               // Mark for threadfence at the end
               fenceNeeded |= true;
             }
@@ -437,18 +433,15 @@ private:
           // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
           waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
           subBarrier();
-          if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
-            // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
-            // Do nothing
-          } else {
-            for (int j=0; j<fan.nrecv(); j++) {
-              int i = (j+shift)%fan.nrecv();
-              pOffset = i*peerOffset;
-              if (skip >= 0 && i >= skip) pOffset += peerElem;
-              void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
-              int realPeerSize = min(realSize, totalElem-pOffset);
-              if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
-            }
+          #pragma unroll 1
+          for (int j=0; j<fan.nrecv(); j++) {
+            int i = (j+shift)%fan.nrecv();
+            pOffset = i*peerOffset;
+            if (skip >= 0 && i >= skip) pOffset += peerElem;
+            void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
+            int realPeerSize = min(realSize, totalElem-pOffset);
+            if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
+            if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
           }
         }
       }
@@ -469,14 +462,7 @@ private:
       }
       if (flags & RoleWaitRecv) {
         ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
-        if ((index == 0) && (flags & RoleWaitRecv)) {
-          if (conn->flags & NCCL_NVLS_MIN_POLL) {
-            flags |= NvlsMinPolling;
-            ncclShmem.groups[group].nvlsRecv = 1;
-          } else {
-            ncclShmem.groups[group].nvlsRecv = 0;
-          }
-        }
+        flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
         connStepPtr = conn->tail;
         connStepCache = loadStepValue(connStepPtr);
         flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
@@ -554,18 +540,16 @@ private:
  public:
   __forceinline__ __device__ Primitives(
       int tid, int nthreads, int const *recvPeers, int const *sendPeers,
-      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
+      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
     ):
-    tid(tid), tidInBlock(threadIdx.x),
+    tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
     stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
 
     // For send operations, we need an extra warp to overlap the threadfence and the copy
-    this->nthreads = nthreads;
+    barriers = &ncclShmem.groups[group].barrier;
+    barrier_next = ncclShmem.groups[group].barrier_next;
     this->nworkers = nthreads;
-    this->group = group & (uint16_t)0xFFFF;
-    int connIndex = group >> 16;
-    barriers = &ncclShmem.groups[this->group].barrier;
-    barrier_next = ncclShmem.groups[this->group].barrier_next;
 
     int nrecv=0, nsend=0;
     while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
@@ -595,8 +579,8 @@ private:
     if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
     if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
 
-    loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e);
-    loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e);
+    loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
+    loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
 
     setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
   }
@@ -707,62 +691,62 @@ private:
   }
 
   __device__ __forceinline__ void send(intptr_t inpIx, int eltN) {
-    genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, -1, eltN, false);
+    genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false);
   }
   __device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) {
-    genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, -1, eltN, false);
+    genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false);
   }
-  __device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) {
-    genericOp<0, 1, 0, 1, Input, -1>(inpIx, -1, remoteOutIx, eltN, false);
+  __device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
+    genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false);
   }
-  __device__ __forceinline__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
-    genericOp<0, 1, 0, 1, Output, -1>(outIx, -1, remoteOutIx, eltN, false);
+  __device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) {
+    genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false); // outIx is passed as both srcIx and dstIx of genericOp
   }
 
   __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, postOp);
+    genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
   }
   __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
-    genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, /*postOp=*/false);
+    genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
   }
 
   __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp);
+    genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
-    genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
+  __device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
   }
 
   __device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
-    genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp);
+    genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp);
   }
   __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp);
+    genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) {
-    genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, false);
+  __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
+    genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
   }
-  __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
-    genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, postOp);
+  __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
   }
 
   __device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, -1, eltN, postOp);
+    genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
   }
 
   __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
-    genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, -1, eltN, postOp);
+    genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, intptr_t remoteInpIx, int eltN, bool postOp=false) {
-    genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, remoteInpIx, eltN, postOp);
+  __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
+    genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
   }
 
   __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp);
+    genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) {
+  __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     // Direct is only for the send part
-    genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
+    genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
   }
 
   __device__ __forceinline__ void
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 790eca0efa..491258ea62 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -31,7 +31,7 @@ namespace {
     const int root = args->root;
 
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
+      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
 
     auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
       int realChunkSize;
diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h
index ab490450ef..b5336570f4 100644
--- a/src/collectives/device/reduce_kernel.h
+++ b/src/collectives/device/reduce_kernel.h
@@ -56,9 +56,14 @@ struct Apply_PostOp/*{
   static BytePack<EltPerPack*sizeof(T)> postOp(Fn fn, BytePack<EltPerPack*sizeof(T)> a);
 }*/;
 template<typename Fn>
+struct LoadMultimem_BigPackSize/*{
+  // If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem,
+  // otherwise there are no valid pack sizes for LoadMultimem.
+  static constexpr int BigPackSize = 0;
+}*/;
+template<typename Fn, int BytePerPack>
 struct Apply_LoadMultimem/*{
-  static constexpr int PackSize; // 0 if not implemented
-  static BytePack<PackSize> load(Fn fn, uintptr_t addr);
+  static BytePack<BytePerPack> load(Fn fn, uintptr_t addr);
 }*/;
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -70,7 +75,7 @@ struct Apply_LoadMultimem/*{
 template<typename Fn, typename Pack>
 __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
   return fromPack<Pack>(
-    Apply_Reduce<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
+    Apply_Reduce<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
       ::reduce(fn, toPack(a), toPack(b))
   );
 }
@@ -78,7 +83,7 @@ __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
 template<typename Fn, typename Pack>
 __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
   return fromPack<Pack>(
-    Apply_PreOp<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
+    Apply_PreOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
       ::preOp(fn, toPack(a))
   );
 }
@@ -86,19 +91,27 @@ __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
 template<typename Fn, typename Pack>
 __device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) {
   return fromPack<Pack>(
-    Apply_PostOp<Fn, sizeof(Pack)/sizeof(typename Fn::EltType)>
+    Apply_PostOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
       ::postOp(fn, toPack(a))
   );
 }
 
-template<typename Fn>
-__device__ __forceinline__ BytePack<Apply_LoadMultimem<Fn>::PackSize> applyLoadMultimem(Fn fn, uintptr_t addr) {
-  return Apply_LoadMultimem<Fn>::load(fn, addr);
+template<typename Fn, int BytePerPack>
+__device__ __forceinline__ BytePack<BytePerPack> applyLoadMultimem(Fn fn, uintptr_t addr) {
+  return Apply_LoadMultimem<Fn, BytePerPack>::load(fn, addr);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Apply_Reduce
 
+// Degenerate base case (EltPerPack == 0): an empty pack has no elements to reduce.
+template<typename Fn>
+struct Apply_Reduce<Fn, /*EltPerPack=*/0> {
+  __device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
+    return  {};  // all arguments are ignored; the result is an empty pack
+  }
+};
+
 // General recursive definition (EltPerPack > 1). This is how we iterate over
 // all elements in a pack of any size, by breaking it into halves. Eventually
 // we'll hit a base case (a more specific template specialization which takes
@@ -283,6 +296,14 @@ struct Apply_PreOp<Fn, /*EltPerPack=*/1> {
     return a;
   }
 };
+// Degenerate base case (EltPerPack == 0): an empty pack has nothing to pre-process.
+template<typename Fn>
+struct Apply_PreOp<Fn, /*EltPerPack=*/0> {
+  static constexpr bool IsIdentity = true;
+  __device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) {
+    return {};  // arguments are ignored; the result is an empty pack
+  }
+};
 
 ////////////////////////////////////////////////////////////////////////////////
 // Apply_PostOp
@@ -316,6 +337,14 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/1> {
     return a;
   }
 };
+// Degenerate base case (EltPerPack == 0): an empty pack has nothing to post-process.
+template<typename Fn>
+struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
+  static constexpr bool IsIdentity = true;
+  __device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) {
+    return {};  // arguments are ignored; the result is an empty pack
+  }
+};
 
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -506,11 +535,6 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
 ////////////////////////////////////////////////////////////////////////////////
 // Apply_LoadMultimem
 
-template<typename Fn>
-struct Apply_LoadMultimem {
-  static constexpr int PackSize = 0; // Indicates not implemented
-};
-
 #define SIZEOF_BytePack_field_u16 2
 #define PTX_REG_BytePack_field_u16 "h"
 
@@ -522,11 +546,11 @@ struct Apply_LoadMultimem {
 
 #define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
   template<> \
-  struct Apply_LoadMultimem<Fn<T>> { \
-    static constexpr int PackSize = 1*(SIZEOF_BytePack_field_##pack_field); \
+  struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
+    static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
     __device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.global." #op "." #ptx_ty " %0, [%1];" \
+      asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
         : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
         : "l"(addr)); \
       return ans; \
@@ -534,11 +558,11 @@ struct Apply_LoadMultimem {
   };
 #define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
   template<> \
-  struct Apply_LoadMultimem<Fn<T>> { \
+  struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
     static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
     __device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+      asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
         : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
           "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
           "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
@@ -547,8 +571,45 @@ struct Apply_LoadMultimem {
       return ans; \
     } \
   };
+#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
+  DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) /* vector (4 registers) variant */ \
+  template<> \
+  struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { /* single-element variant built on a 2-element load */ \
+    __device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
+      BytePack<2*sizeof(T)> tmp; /* receives the pair of elements that contains addr */ \
+      asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
+        : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
+        : "l"(addr & -uintptr_t(2*sizeof(T)))); /* round down to the pair boundary, not the element boundary */ \
+      return tmp.half[(addr/sizeof(T))%2]; /* keep the half that addr actually referred to */ \
+    } \
+  };
+
+template<typename Fn, int BytePerPack>  // primary template: used when no specialization matches
+struct Apply_LoadMultimem {
+  __device__ static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
+    //__trap();  (left disabled, so an unsupported combination is not reported here)
+    return {};  // fn and addr are unused; an empty-initialized pack is returned
+  }
+};
 
 #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
+  template<typename Fn>  // BigPackSize: large pack size in bytes usable with LoadMultimem for Fn, 0 if none
+  struct LoadMultimem_BigPackSize {
+    using T = typename Fn::EltType;
+    static constexpr bool IsSum = std::is_same<Fn, FuncSum<T>>::value ||
+                                  std::is_same<Fn, FuncPreMulSum<T>>::value ||
+                                  std::is_same<Fn, FuncSumPostDiv<T>>::value;  // NOTE(review): confirm Apply_LoadMultimem also handles the PreMulSum/SumPostDiv functors
+    static constexpr bool IsMinOrMax = std::is_same<Fn, FuncMin<T>>::value ||
+                                       std::is_same<Fn, FuncMax<T>>::value;
+    static constexpr bool IsFloat = IsFloatingPoint<T>::value;
+    static constexpr int BigPackSize =
+      IsFloat && IsSum && sizeof(T) < 8 ? 16 :  // floating-point sum, element smaller than 8 bytes
+      IsFloat && IsSum ? 8 :  // floating-point sum, element of 8 bytes or more
+      IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 :  // floating-point min/max, 2-byte elements only
+      !IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) :  // non-floating-point sum/min/max: one element per pack
+      /*multimem.ld_reduce not supported:*/ 0;
+  };
+
   DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
   DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
   DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
@@ -565,23 +626,30 @@ struct Apply_LoadMultimem {
   DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
   DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
 
+  DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
   DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
 
   DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
 
-  DEFINE_Apply_LoadMultimem_v4(FuncSum, half, add, f16x2, u32)
-  DEFINE_Apply_LoadMultimem_v4(FuncMin, half, min, f16x2, u32)
-  DEFINE_Apply_LoadMultimem_v4(FuncMax, half, max, f16x2, u32)
+  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
+  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
+  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
 
   #if defined(__CUDA_BF16_TYPES_EXIST__)
-    DEFINE_Apply_LoadMultimem_v4(FuncSum, __nv_bfloat16, add, bf16x2, u32)
-    DEFINE_Apply_LoadMultimem_v4(FuncMin, __nv_bfloat16, min, bf16x2, u32)
-    DEFINE_Apply_LoadMultimem_v4(FuncMax, __nv_bfloat16, max, bf16x2, u32)
+    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
+    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
+    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
   #endif
+#else
+  template<typename Fn>
+  struct LoadMultimem_BigPackSize {
+    static constexpr int BigPackSize = 0;
+  };
 #endif
 
 #undef DEFINE_Apply_LoadMultimem
 #undef DEFINE_Apply_LoadMultimem_v4
+#undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf
 #undef SIZEOF_BytePack_field_u64
 #undef PTX_REG_BytePack_field_u64
 #undef SIZEOF_BytePack_field_u32
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index cf4278485c..6a7caeba37 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -30,7 +30,7 @@ namespace {
     const ssize_t size = args->count;
 
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
+      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
 
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       ssize_t realChunkSize;
@@ -113,19 +113,19 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
 
     if (tid < tidEndScatter) {
       // Scatter
-      int group = (0*Proto::MaxGroupWidth) | (0<<16);
       Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args);
+        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
+            args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*chunkSize;
         int nelem = min(chunkSize, size-offset);
         prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
       }
     } else if (tid < tidEndReduce) {
-      int group = (3*Proto::MaxGroupWidth) | (1<<16);
-      // Reduce through MC
+      // Reduce through NVLS
       Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
+        prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
+           args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*chunkSize;
         int nelem = min(chunkSize, size-offset);
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
index abb71b7a6d..030b1af7c4 100644
--- a/src/collectives/device/sendrecv.h
+++ b/src/collectives/device/sendrecv.h
@@ -15,7 +15,7 @@
 template<typename T, typename RedOp>
 struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
   template<typename Proto>
-  __device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+  __device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
     void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
     ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
 
@@ -58,9 +58,8 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
         }
 #endif
 
-        ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
+        reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
           (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
-
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
         if (isNpKitThread) {
           NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
@@ -80,7 +79,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
       if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
       int const peer = args->peer;
       Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
-        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);
+        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
 
 #if defined(ENABLE_NPKIT)
       if (isNpKitThread) {
@@ -114,7 +113,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
   }
 
   template<typename Proto>
-  __device__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+  __device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
 #if defined(ENABLE_NPKIT)
     bool isNpKitThread = (tid == 0);
     int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1;
@@ -142,7 +141,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
       if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
       int const peer = args->peer;
       Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
-        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);
+        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
 
 #if defined(ENABLE_NPKIT)
       if (isNpKitThread) {
@@ -189,11 +188,10 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
     // warpStarts were rounded thanks to int division, but for group number we need to round the other way around
     // So we mirror wid then mirror again the group.
     #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
-    int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
+    uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
     args += group;
     tid -= args->warpStart * WARP_SIZE;
     int nthreads = args->nWarps * WARP_SIZE;
-    group |= (args->connIndex<<16); // Used to select connIndex 1
 
     if (args->p2pType == ncclWorkP2pTypeUnused) return;
     if (tid >= nthreads || args->peer == -1) return;
diff --git a/src/debug.cc b/src/debug.cc
index 560c1d26a0..b88fa5982a 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -74,6 +74,8 @@ void ncclDebugInit() {
         mask = NCCL_ALLOC;
       } else if (strcasecmp(subsys, "CALL") == 0) {
         mask = NCCL_CALL;
+      } else if (strcasecmp(subsys, "PROXY") == 0) {
+        mask = NCCL_PROXY;
       } else if (strcasecmp(subsys, "NVLS") == 0) {
         mask = NCCL_NVLS;
       } else if (strcasecmp(subsys, "ALL") == 0) {
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 61bc8169f4..6f3318e7e6 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -167,12 +167,13 @@ static void finishWork(struct ncclWork* work, int WarpSize) {
 
 static void appendWorkElemP2p(
     struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
-    struct ncclWorkElemP2p const *elem
+    struct ncclWorkElemP2p const *elem, bool fuseOk
   ) {
   constexpr int funcIndex = FUNC_INDEX_P2P;
   struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
   struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
   if (q && funcIndex == q->work.header.funcIndex) {
+    if (!fuseOk) goto NewWork;
     if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) {
       for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) {
         // Can't have multiple elements of the same ncclWork communicate with the
@@ -301,7 +302,7 @@ NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384);
 // ensure *nWorkBudget >= 1 upon entry.
 static ncclResult_t addP2pToPlan(
     struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget,
-    bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, uint32_t connIndex
+    bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, uint32_t connIndex, bool fuseOk
   ) {
   struct ncclInfo info = {
     isSendNotRecv ? ncclFuncSend : ncclFuncRecv,
@@ -316,7 +317,7 @@ static ncclResult_t addP2pToPlan(
 
   // 1 is connIndex
   struct ncclConnInfo* conn = isSendNotRecv ?
-    &comm->channels[channelId].peers[peer].send[1].conn : &comm->channels[channelId].peers[peer].recv[1].conn;
+    &comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn;
   info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE;
 
   struct ncclProxyOp proxyOp = {};
@@ -337,7 +338,7 @@ static ncclResult_t addP2pToPlan(
   elem.connIndex = connIndex;
 
   *nWorkBudget += plan->channels[channelId].nWork;
-  appendWorkElemP2p(comm, plan, channelId, &elem);
+  appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk);
   *nWorkBudget -= plan->channels[channelId].nWork;
 
   // Calculate the opCount after appendWorkElemP2p since it will always return
@@ -508,7 +509,7 @@ static ncclResult_t scheduleCollTasksToPlan(
       info.sliceSteps = head->sliceSteps;
       NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks));
       if (nAggOps > 1) {
-        int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
+        int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
         info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]);
         info.nChannels = std::max(1, std::min(info.nChannels, maxChannels));
         info.algorithm = aggInfo.algorithm;
@@ -533,7 +534,7 @@ static ncclResult_t scheduleCollTasksToPlan(
         NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
       }
 
-      int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
+      int maxChannels = info.algorithm == NCCL_ALGO_NVLS || info.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels; // both NVLS variants are capped by nvlsChannels
       NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
         maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
       tasks->nTasksColl -= 1;
@@ -584,17 +585,22 @@ static ncclResult_t scheduleP2pTasksToPlan(
   // Try to use all channels
   int nChannelsMax = comm->p2pnChannelsPerPeer;
   int nChannelsMin = nChannelsMax;
-  // Try to use all channels, but one channel per operation.
-  while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
-  // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
-  while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
+  if (comm->nNodes == 1) {
+    // Try to use all channels, but one channel per operation.
+    while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
+    // Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
+    while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
+  }
 
+  bool fuseOk = false; // false makes appendWorkElemP2p start a new work item instead of fusing
+  // We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries.
   while (tasks->nTasksP2p != 0) {
-    for (int i=0; i < nRanks; i++) {
+    for (int i=0; i < tasks->p2pOrderSteps; i++) {
       int sendPeer = sendOrder[i];
       int recvPeer = recvOrder[i];
-      struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendPeer].sendQueue);
-      struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvPeer].recvQueue);
+      if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false; // first step of each group must not fuse with the previous group
+      struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL;
+      struct ncclTaskP2p* recv = recvPeer != -1 ? ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL;
       if (sendPeer == comm->rank) {
         if (recvPeer != comm->rank) {
           WARN("Sendrecv plan not aligned for self");
@@ -639,7 +645,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
           if (recvChunkBytes != 0) {
             if (recvChunkBytes == -1) recvChunkBytes = 0;
             if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
-            NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, recvIdx));
+            NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, recvIdx, fuseOk));
+            fuseOk = true;
             recvPtr += recvChunkBytes;
             recvBytes -= recvChunkBytes;
             recv->chunk += 1;
@@ -652,7 +659,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
           if (sendChunkBytes != 0) {
             if (sendChunkBytes == -1) sendChunkBytes = 0;
             if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
-            NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, sendIdx));
+            NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, sendIdx, fuseOk));
+            fuseOk = true;
             sendPtr += sendChunkBytes;
             sendBytes -= sendChunkBytes;
             send->chunk += 1;
@@ -785,12 +793,12 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
 }
 
 static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) {
-  uint64_t collOpCount = comm->collOpCount;
+  uint64_t collOpCount = comm->sharedRes->collOpCount;
   // Advance comm's collOpCount by number of colls in this plan.
-  comm->collOpCount = collOpCount + plan->collOpCount;
+  comm->sharedRes->collOpCount += plan->collOpCount;
   for (int c=0; c < plan->channelUbound; c++) {
     struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue);
-    uint64_t p2pOpCount = comm->channels[c].p2pOpCount;
+    uint64_t p2pOpCount = comm->sharedRes->p2pOpCount[c];
     uint64_t nextP2pOpCount = p2pOpCount;
     while (q != nullptr) {
       struct ncclProxyOp* qNext = q->enqNext;
@@ -813,7 +821,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
       q = qNext;
     }
     // Advance channel's p2pOpCount by number of p2p's in this plan channel.
-    comm->channels[c].p2pOpCount = nextP2pOpCount;
+    comm->sharedRes->p2pOpCount[c] = nextP2pOpCount;
   }
   return ncclSuccess;
 }
@@ -932,15 +940,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
     // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires
     // at least one of the two streams to be strong-stream.
     cudaStream_t launchStream = tasks->streams->stream;
-    NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->deviceStream), result, failure);
+    NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure);
 
     if (tasks->numStreams != 1) {
       // Create dependency for device stream on user streams. First from extra user
       // streams to deviceStream. Then deviceStream to first user stream.
       for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) {
-        NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, l->stream), result, failure);
+        NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure);
       }
-      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->deviceStream), result, failure);
+      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
     } else if (tasks->streams->stream != comm->lastStream && comm->lastStream != nullptr) {
       // Stream changed from last call, create dependency against last NCCL kernel launch
       CUDACHECK(hipStreamWaitEvent(tasks->streams->stream, comm->doneEvent, 0));
@@ -954,15 +962,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
         if (plan->hasProxyOps) {
           if (!acquired) {
             acquired = true;
-            NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->hostStream), result, failure);
+            NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
           }
-          NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->hostStream, hostStreamPlanCallback, plan), result, failure);
+          NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure);
         }
       }
       if (acquired) {
         // Make to-be-launched kernels dependent on just-launched host stream tasks.
-        if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->hostStream), result, failure);
-        NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->hostStream), result, failure);
+        if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure);
+        NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
       }
     }
 
@@ -1011,7 +1019,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
   NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
   if (driverVersion >= 11080) {
     int compCap = comm->compCap;
-    unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0;
+    unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0;
 
     cudaLaunchConfig_t launchConfig = {0};
     cudaLaunchAttribute launchAttrs[3];
@@ -1083,7 +1091,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
     // Create dependency for deviceStream on launchStream. We know that deviceStream
     // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
     // so we can say that launchStream subsumes it.
-    if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
+    if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
   resume1:
     // Create dependency for other user streams (skip launch stream) on deviceStream.
     // Again, the user streams haven't been touched since deviceStream waited on them
@@ -1091,13 +1099,13 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
     struct ncclCudaStreamList* sl = tasks->streams->next;
     tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
     while (sl != nullptr && tasks->numStreams != 1) {
-      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream, /*b_subsumes_a=*/true), result, resume2);
+      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2);
     resume2:
       sl = sl->next;
     }
     tasks->numStreams = 0;
     // Release device stream as acquired in ncclLaunchPrepare()
-    NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->deviceStream), result, resume3);
+    NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3);
   resume3:;
   }
   return result;
@@ -1108,13 +1116,9 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
 /*****************************************************************************/
 
 static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
-  if (info->comm->collNetSupport > 0) {
-    // Translate ncclAvg and PreMulSum
-    ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
-    NCCLCHECK(collNetReduceSupport(info->comm, info->datatype, netOp, collNetTypeSupport));
-  } else {
-    *collNetTypeSupport = 0;
-  }
+  // Translate ncclAvg and PreMulSum
+  ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
+  *collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype];
   return ncclSuccess;
 }
 
@@ -1134,6 +1138,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
     for (int a=0; a<nAlgos; a++) {
       if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
       if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
+      if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue;
+      if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
         float time;
@@ -1167,7 +1173,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
       }
       ncSwitch /= 2;
     }
-  } else if (info->algorithm == NCCL_ALGO_NVLS) {
+  } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
     // NVLS should not need more than 16 channels to get peak BW.
     nc = comm->nvlsChannels;
   } else {
@@ -1185,12 +1191,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
   if (info->protocol == NCCL_PROTO_SIMPLE) {
-    nt += WARP_SIZE; // Extra warp for sync
+    if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
     // More threads or sync warps needed due to split thread model
-    if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
-    if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
-    if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
-    if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS;
+    if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
   }
   nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
 #endif
@@ -1234,11 +1237,15 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
       info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
     case ncclFuncReduceScatter:
     case ncclFuncAllGather:
+      info->pattern =
+        info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
+        ncclPatternRing; break;
     case ncclFuncAllToAllPivot:
       info->pattern = ncclPatternRing; break;
     case ncclFuncAllReduce:
       info->pattern =
         info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
+        info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree :
         info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
         info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
         info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
@@ -1258,14 +1265,17 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
     case ncclPatternPipelineFrom:
     case ncclPatternPipelineTo:
     case ncclPatternCollnetChain:
+      info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
     case ncclPatternNvls:
-      info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
+      info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break;
     case ncclPatternCollnetDirect:
       info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
     case ncclPatternRing:
       info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
     case ncclPatternRingTwice:
       info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
+    case ncclPatternNvlsTree:
+      info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break;
     default:
       WARN("Unknown pattern %d", info->pattern);
       return ncclInternalError;
@@ -1348,13 +1358,22 @@ comp_next:
     while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
     work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_NVLS) {
-    if (chunkSize > 131072) chunkSize = 131072;
+    int maxChunkSize = 131072;
+    if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
-    if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
+    if ((info->nBytes < (64 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
     if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
     if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
     work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+  } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
+    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
+    uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
+    if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
+    if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
+    if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
+    if ((info->nBytes < (1 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
+    work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->protocol == NCCL_PROTO_LL) {
     const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
     const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
@@ -1383,8 +1402,7 @@ comp_next:
   proxyOp->chunkSize = chunkSize;
   proxyOp->protocol = info->protocol;
   proxyOp->dtype = info->datatype;
-  proxyOp->redOp = (info->algorithm != NCCL_ALGO_COLLNET_DIRECT && info->algorithm != NCCL_ALGO_COLLNET_CHAIN) ? ncclNumOps : // Only set redOp when using CollNet
-                     info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
+  proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
                      info->op;
   proxyOp->pattern = info->pattern;
   proxyOp->root = info->root;
@@ -1502,20 +1520,20 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
           int channelId;
           NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
           if (isSendNotRecv) {
-            if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
+            if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector
               comm->connectSend[peer] |= (1UL<<channelId);
               ncclGroupCommPreconnect(comm);
             }
-            if (comm->p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
+            if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
               comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
               ncclGroupCommPreconnect(comm);
             }
           } else {
-            if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
+            if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector
               comm->connectRecv[peer] |= (1UL<<channelId);
               ncclGroupCommPreconnect(comm);
             }
-            if (comm->p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
+            if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
               comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
               ncclGroupCommPreconnect(comm);
             }
@@ -1550,7 +1568,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
       t->chunkSteps = info->chunkSteps;
       t->sliceSteps = info->sliceSteps;
       ncclIntruQueueEnqueue(&tasks->collQueue, t);
-      tasks->collBytesTotal += t->count*ncclTypeSize(t->datatype);
+      tasks->collBytesTotal += info->nBytes;
       tasks->nTasksColl += 1;
     }
   }
@@ -1611,10 +1629,10 @@ exit:
   NCCLCHECK(ncclGroupEndInternal());
   /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
    * so we have to check state here. */
-  if (info->comm && !info->comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
+  if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
   return ret;
 fail:
-  if (info->comm && !info->comm->blocking) (void) ncclCommSetAsyncError(info->comm, ret);
+  if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
   goto exit;
 }
 
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index 617ea78e5f..82a51e77ad 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -15,15 +15,10 @@
 /********************* Internode connection ***********************/
 /******************************************************************/
 
-ncclResult_t ncclTopoPreset(struct ncclComm* comm,
-    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
-    struct ncclTopoRanks* topoRanks) {
+ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
   int rank = comm->rank;
+  int localRanks = comm->topo->nodes[GPU].count;
   int nChannels = comm->nChannels;
-  int localRanks = 0;
-  for (int i=0; i<comm->topo->nodes[GPU].count; i++) {
-    localRanks += comm->topo->nodes[GPU].nodes[i].gpu.nRanksPerGpu;
-  }
 
   for (int c=0; c<nChannels; c++) {
     struct ncclChannel* channel = comm->channels+c;
@@ -39,9 +34,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
     for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
     for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
 
-    int* ringIntra = ringGraph->intra+c*localRanks;
-    int* treeIntra = treeGraph->intra+c*localRanks;
-    int* collNetIntra = collNetGraph->intra+c*localRanks;
+    int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
+    int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
+    int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
+    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks;
 
     for (int i=0; i<localRanks; i++) {
       if (ringIntra[i] == rank) {
@@ -52,8 +48,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
       }
       if (treeIntra[i] == rank) {
         int parentIndex = 0;
-        int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
-        int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
+        int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
+        int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
 
         topoRanks->treeToParent[c] = treeIntra[parentIndex];
         topoRanks->treeToChild0[c] = treeIntra[child0Index];
@@ -68,6 +64,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
     }
     topoRanks->ringPrev[c] = channel->ring.prev;
     topoRanks->ringNext[c] = channel->ring.next;
+    topoRanks->nvlsHeads[c] = nvlsIntra[0];
   }
   // Duplicate channels rings/trees
   struct ncclChannel* channel0 = comm->channels;
@@ -79,10 +76,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
 ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
     struct ncclTopoGraph* treeGraph) {
   int nChannels = comm->nChannels;
-  int localRanks = 0;
-  for (int i=0; i<comm->topo->nodes[GPU].count; i++) {
-    localRanks += comm->topo->nodes[GPU].nodes[i].gpu.nRanksPerGpu;
-  }
+  int localRanks = comm->topo->nodes[GPU].count;
   //new tree
   for (int c=0; c<nChannels; c++) {
     int* treeIntra = treeGraph->intra+c%3*localRanks;
@@ -120,26 +114,26 @@ ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
   return ncclSuccess;
 }
 
-static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
+static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
   int nChannels = comm->nChannels;
   int nNodes = comm->nNodes;
   for (int c=0; c<nChannels; c++) {
-    int* recv = ringRecv+c*comm->nRanks;
-    int* send = ringSend+c*comm->nRanks;
+    int* recv = ringRecv+c*comm->nNodes;
+    int* send = ringSend+c*comm->nNodes;
     int* prev = ringPrev+c*comm->nRanks;
     int* next = ringNext+c*comm->nRanks;
     struct ncclChannel* channel0 = comm->channels+c;
     struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels;
     for (int n=0; n<nNodes; n++) {
-      int recvRank = recv[firstRanks[n]];
-      int prevSendRank = send[firstRanks[(n-1+nNodes)%nNodes]];
+      int recvRank = recv[n];
+      int prevSendRank = send[(n-1+nNodes)%nNodes];
       prev[recvRank] = prevSendRank;
       if (comm->rank == recvRank) {
         channel0->ring.prev = prevSendRank;
         if (channel1) channel1->ring.prev = prevSendRank;
       }
-      int sendRank = send[firstRanks[n]];
-      int nextRecvRank = recv[firstRanks[(n+1)%nNodes]];
+      int sendRank = send[n];
+      int nextRecvRank = recv[(n+1)%nNodes];
       next[sendRank] = nextRecvRank;
       if (comm->rank == sendRank) {
         channel0->ring.next = nextRecvRank;
@@ -152,8 +146,8 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
   return ncclSuccess;
 }
 
-static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) {
- for (int n=0; n<nNodes; n++) indexes[n] = ranks[firstRanks[n]];
+static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) { // Copies ranks[0..nNodes) into indexes; always returns ncclSuccess.
+ for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
  return ncclSuccess;
 }
 
@@ -175,42 +169,38 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
   return ncclSuccess;
 }
 
-static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
+static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
   const int nChannels = (comm->nChannels > MAXCHANNELS/2) ? comm->nChannels/2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node;
-  int* ranksToParent, *ranksToChild0, *ranksToChild1;
-  NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
-  NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
-  NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
 
   // Compute tree depth. Not an exact value but a good approximation in most
   // cases
   int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
 
   int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
+  int* ttp, *ttc0, *ttc1;
   NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
-
   if (comm->nChannels <= MAXCHANNELS/2) {
     for (int c=0; c<nChannels; c++) {
        struct ncclChannel* channel0 = comm->channels+c;
        struct ncclChannel* channel1 = channel0+nChannels;
-       NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
-       NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
-       NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
-       if (comm->rank == ranksToParent[node]) {
-         NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
-         NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
+       ttp = treeToParent+c*comm->nNodes;
+       ttc0 = treeToChild0+c*comm->nNodes;
+       ttc1 = treeToChild1+c*comm->nNodes;
+       if (comm->rank == ttp[node]) {
+         NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
+         NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
        }
-       if (comm->rank == ranksToChild0[node]) {
-         NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
-         NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
+       if (comm->rank == ttc0[node]) {
+         NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
+         NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
        }
-       if (comm->rank == ranksToChild1[node]) {
-         NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
-         NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
+       if (comm->rank == ttc1[node]) {
+         NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
+         NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
        }
-       if (comm->rank == ranksToParent[node] ||
-           comm->rank == ranksToChild0[node] ||
-           comm->rank == ranksToChild1[node]) {
+       if (comm->rank == ttp[node] ||
+           comm->rank == ttc0[node] ||
+           comm->rank == ttc1[node]) {
          INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c,           channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
          INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
        }
@@ -219,64 +209,63 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int*
   } else {
     for (int c=0; c<nChannels; c++) {
        struct ncclChannel* channel0 = comm->channels+c;
-       NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
-       NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
-       NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
-       if (comm->rank == ranksToParent[node]) {
-         NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
+       ttp = treeToParent+c*comm->nNodes;
+       ttc0 = treeToChild0+c*comm->nNodes;
+       ttc1 = treeToChild1+c*comm->nNodes;
+       if (comm->rank == ttp[node]) {
+         NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
        }
-       if (comm->rank == ranksToChild0[node]) {
-         NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
+       if (comm->rank == ttc0[node]) {
+         NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
        }
-       if (comm->rank == ranksToChild1[node]) {
-         NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
+       if (comm->rank == ttc1[node]) {
+         NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
        }
-       if (comm->rank == ranksToParent[node] ||
-           comm->rank == ranksToChild0[node] ||
-           comm->rank == ranksToChild1[node]) {
-         INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
+       if (comm->rank == ttp[node] ||
+           comm->rank == ttc0[node] ||
+           comm->rank == ttc1[node]) {
+         INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c,           channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
        }
        channel0->tree.depth = depth;
     }
     for (int c=nChannels; c<nChannels*2; c++) {
        struct ncclChannel* channel1 = comm->channels+c;
-       NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
-       NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
-       NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
-       if (comm->rank == ranksToParent[node]) {
-         NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
+       ttp = treeToParent+c*comm->nNodes;
+       ttc0 = treeToChild0+c*comm->nNodes;
+       ttc1 = treeToChild1+c*comm->nNodes;
+       if (comm->rank == ttp[node]) {
+         NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
        }
-       if (comm->rank == ranksToChild0[node]) {
-         NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
+       if (comm->rank == ttc0[node]) {
+         NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
        }
-       if (comm->rank == ranksToChild1[node]) {
-         NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
+       if (comm->rank == ttc1[node]) {
+         NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
        }
-       if (comm->rank == ranksToParent[node] ||
-           comm->rank == ranksToChild0[node] ||
-           comm->rank == ranksToChild1[node]) {
-         INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
+       if (comm->rank == ttp[node] ||
+           comm->rank == ttc0[node] ||
+           comm->rank == ttc1[node]) {
+         INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
        }
        channel1->tree.depth = depth;
     }
   }
-  free(ranksToParent);
-  free(ranksToChild0);
-  free(ranksToChild1);
   return ncclSuccess;
 }
 
 static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
   int rank = comm->rank;
   int localRanks = comm->localRanks;
-  int nHeads = collNetGraph->nChannels;
+  int nHeads = 0;
   int *heads;
-  NCCLCHECK(ncclCalloc(&heads, nHeads));
+  NCCLCHECK(ncclCalloc(&heads, localRanks));
   // Find all head ranks
   // Head index is always 0
-  for (int c=0; c<nHeads; c++) {
+  for (int c=0; c<collNetGraph->nChannels; c++) {
     int* collNetIntra = collNetGraph->intra+c*localRanks;
-    heads[c] = collNetIntra[0];
+    int head = collNetIntra[0];
+    for (int h=0; h<nHeads; h++) if (heads[h] == head) head = -1;
+    if (head != -1) heads[nHeads++] = collNetIntra[0];
   }
   // For all channels
   for (int c=0; c<comm->nChannels; c++) {
@@ -315,10 +304,96 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
     INFO(NCCL_GRAPH, "%s", line);
     channel->collnetChain.depth = comm->nRanks/comm->nNodes;
   }
+  for (int c=0; c<comm->nvlsChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
+  }
   free(heads);
   return ncclSuccess;
 }
 
+static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) { // Fills channel->nvls for every NVLS channel; with more than one node it also records tree up/down ranks.
+  int nHeads = nvlsGraph->nChannels; // One head per NVLS graph channel.
+  int headRank = -1; // Stays -1 when this rank is not the first intra entry of any NVLS graph channel.
+  for (int h=0; h<nHeads; h++) {
+    if (nvlsGraph->intra[h*comm->localRanks] == comm->rank) headRank = h; // No break: the last matching head index wins.
+  }
+
+  if (nHeads == 0) { // No NVLS graph channels: zero the NVLS channel count and stop.
+    comm->nvlsChannels = 0;
+    return ncclSuccess;
+  }
+
+  for (int c=0; c<comm->nvlsChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    channel->nvls.nHeads = nHeads;
+    for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h; // Values beyond nRanks; presumably pseudo-peer ids for the heads (TODO confirm against the NVLS transport).
+    for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1; // Mark unused slots.
+    channel->nvls.down = comm->nRanks+1+headRank; // Evaluates to nRanks when headRank is -1.
+    channel->nvls.out = -1;       // NVLS+SHARP not yet implemented.
+    channel->nvls.headRank = headRank;
+    channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; // Defaults; only overwritten below when nNodes > 1.
+    channel->nvls.node = comm->node;
+    channel->nvls.nNodes = comm->nNodes;
+  }
+  if (comm->nNodes == 1) return ncclSuccess; // Single node: the tree fields keep their -1 defaults.
+
+  // Connect Trees
+  int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1; // Used below as indexes into a head's per-node rank row; -1 means none.
+  int pc0, pc1; // ignored
+  NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
+        &tree0Parent, &tree0Child0, &tree0Child1, &pc0,
+        &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
+
+  int* heads = NULL;
+  int treeUp[2] = { -1, -1 }; // Index 0/1 holds the value derived from the first/second tree returned by ncclGetDtree.
+  int treeDown0[2] = { -1, -1 };
+  int treeDown1[2] = { -1, -1 };
+
+  if (comm->node == 0) { // Ranks on node 0 log the head table.
+    for (int h=0; h<nHeads; h++) {
+      char line[1024];
+      sprintf(line, "NVLS Head %2d:", h);
+      heads = nvlsHeads+h*comm->nNodes; // Row h of nvlsHeads: one entry per node.
+      for (int n=0; n<comm->nNodes && n<20; n++) { // Log at most the first 20 nodes.
+        sprintf(line+strlen(line), " %2d", heads[n]);
+      }
+      INFO(NCCL_INIT, "%s", line);
+    }
+  }
+
+  // Find the heads where I'm the head rank and retain tree up/down
+  for (int h=0; h<nHeads; h++) {
+    heads = nvlsHeads+h*comm->nNodes;
+    if (heads[comm->node] == comm->rank) {
+      treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent];
+      treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
+      treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
+      treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
+      treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
+      treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
+      break; // Only the first row in which this rank is the head is used.
+    }
+  }
+  // Set tree up/down in all channels (NVLS compute channels work
+  // orthogonally to NVLS search channels).
+  for (int c=0; c<comm->nvlsChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    channel->nvls.treeUp = treeUp[c%2]; // Even/odd channels take the first/second tree.
+    channel->nvls.treeDown[0] = channel->nvls.down; // Slot 0 always holds this channel's nvls.down value.
+    int ix = 1;
+    if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2]; // Existing children are packed into slots 1 and 2.
+    if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2];
+  }
+
+  struct ncclNvls* nvls0 = &comm->channels[0].nvls;
+  struct ncclNvls* nvls1 = &comm->channels[1].nvls; // Channel 1 is read regardless of nvlsChannels; used for logging only.
+  INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
+      nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp,
+      nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp);
+  return ncclSuccess;
+}
+
 // Legacy naming
 NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
 NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
@@ -360,33 +435,40 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev
   return c;
 }
 
-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc) {
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc) {
   // Gather data from all ranks
-  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
+  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
   int nranks = comm->nRanks;
+  int nNodes = comm->nNodes;
   int nChannels = comm->nChannels;
-  NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
-  for (int i=0; i<nranks; i++) {
-    for (int c=0; c<nChannels;c++) {
-      ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
-      ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
-      ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
-      ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
-      treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
-      treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
-      treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
+  NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
+  for (int c=0; c<nChannels;c++) {
+    for (int n=0; n<nNodes; n++) {
+      int r = firstRanks[n];
+      ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
+      ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
+      treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
+      treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
+      treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
+      nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
+    }
+    for (int r=0; r<nranks; r++) {
+      ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
+      ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
     }
   }
 
   // Connect rings and trees. This should also duplicate the channels.
-  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
-  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
+  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
+  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
+  NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS]));
 
   // Duplicate ringPrev/ringNext for ncclBuildRing
   if (nChannels <= MAXCHANNELS/2) memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -400,6 +482,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
 
   // Setup CollNet
   if (comm->collNetSupport == 1) {
+    struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT];
     // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
     if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
       int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
@@ -408,10 +491,21 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
     NCCLCHECK(connectCollNet(comm, collNetGraph));
   }
 
+  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
+  if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= MAXCHANNELS) {
+     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+  }
+
   // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
   // We permit combining max, then min, to only use the first channels, then duplicate them.
-  nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
-  nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(nc, ncclMinNchannels()), ringPrev, ringNext);
+  if (comm->sharedRes->owner != comm) {
+    /* child comm #channels cannot exceed top parent #channels. */
+    nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
+    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
+  } else {
+    nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
+    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
+  }
 
   // Create rings array and check all is fine
   NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
@@ -423,6 +517,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   free(treeToParent);
   free(treeToChild0);
   free(treeToChild1);
+  free(nvlsHeads);
 
   return ncclSuccess;
 }
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 8fc65f9748..cb2bf81b31 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -273,7 +273,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
     struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
     if (intermediateNode->type == GPU) {
       intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
-      if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank[0];
+      if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
     }
   }
 
@@ -409,7 +409,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
   if (distance == PATH_PXN) {
     // In case of PXN, use the intermediate GPU distance instead
     int proxyRank, g;
-    NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank[0], netDev, &proxyRank));
+    NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
     NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
     struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
     distance = proxyGpu->paths[NET][n].type;
@@ -489,7 +489,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
       WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
       return ncclInternalError;
     }
-    *intermediateRank = node->gpu.rank[0];
+    *intermediateRank = node->gpu.rank;
   } else {
     *intermediateRank = rank;
   }
@@ -563,6 +563,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
     NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system));
   }
 
+  // Set direct paths to NVSwitches.
+  for (int n=0; n<system->nodes[NVS].count; n++) {
+    NCCLCHECK(ncclTopoSetPaths(system->nodes[NVS].nodes+n, system));
+  }
+
   // Update path for GPUs when we don't want to / can't use GPU Direct P2P
   for (int g=0; g<system->nodes[GPU].count; g++) {
     for (int p=0; p<system->nodes[GPU].count; p++) {
@@ -578,10 +583,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
 
     if (comm == NULL) continue;
     // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
-    struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank[0];
+    struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank;
     for (int p=0; p<system->nodes[GPU].count; p++) {
       if (p == g) continue;
-      struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank[0];
+      struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
       int p2p;
       NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
       if (p2p == 0) {
@@ -589,7 +594,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
         NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
         if (shm == 0) {
           // Mark this peer as inaccessible. We'll trim it later.
-          system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
+          system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
         }
       }
     }
@@ -603,32 +608,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
       // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
       if (ncclPxnDisable(comm) != 1) {
-        int pxnGpu = -1;
-
-        for (int p=0; p<system->nodes[GPU].count; p++) {
-          if (p == g) continue;
-
+        int localGpuIndex;
+        NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
+        if (localGpuIndex != g && localGpuIndex != -1) {
           // PXN = PCI + NVLink.
-          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
+          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
           // Only use PXN for NIC n if remote GPU p ...
-          if (peerNode->paths[NET][n].type > PATH_PXB || // Is connected to the NIC through PCI
-              peerNode->paths[GPU][g].type > PATH_NVL || // Is connected to us through NVLink
-              (peerNode->paths[NET][n].bw <= gpu->paths[NET][n].bw && // Has either higher BW to that NIC
-               gpu->paths[NET][n].type <= PATH_PXB))                        //     or avoids going through a CPU
-            continue;
-
-          pxnGpu = p;
-
-          int netDev;
-          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank[0], &netDev));
-          // To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
-          if (netDev == netNode->id) break;
-        }
-        if (pxnGpu != -1) {
+          if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
+              peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
+              (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
+               gpu->paths[NET][n].type > PATH_PXB))                  // or avoids going through a CPU
           // We can use that GPU as relay to communicate with that NIC.
           // Only enabling it in the GPU->NIC direction for now to favor
           // receiving locally and sending remotely (consistent with net.cc)
-          NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
+          NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
         }
       }
       // Update path when we dont want to / can't use GPU Direct RDMA.
@@ -659,16 +652,11 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
     domains[g] = g;
     ids[g] = gpu->id;
     for (int p=0; p<g; p++) {
-      if (gpu->paths[GPU][p].count > 0) {
+      if (gpu->paths[GPU][p].type < PATH_NET) {
         domains[g] = std::min(domains[g], domains[p]);
       }
     }
-    for (int j=0; j<gpu->gpu.nRanksPerGpu; j++ ) {
-      if (gpu->gpu.rank[j] == comm->rank) {
-        myDomain = domains[g];
-        break;
-      }
-    }
+    if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
   }
 
   int ngpus = system->nodes[GPU].count;
@@ -732,7 +720,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
     if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
     for (int g = 0; g < system->nodes[GPU].count; g++) {
       int net;
-      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
+      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
       NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
       if (!gdr) break;
     }
@@ -742,16 +730,12 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
       INFO(NCCL_GRAPH, "GDR is available on all GPUs");
     }
   }
-
   if (rcclParamEnableIntranet()) {
     remove = 0;
     system->type |= RCCL_TOPO_FORCE_INTRA;
   }
-  comm->localRanks = 0;
-  for (int n=0; n<system->nodes[GPU].count; n++ ) {
-    comm->localRanks += system->nodes[GPU].nodes[n].gpu.nRanksPerGpu;
-  }
-  if (comm->localRanks == comm->nRanks && remove) {
+  comm->localRanks = system->nodes[GPU].count;
+  if (system->nodes[GPU].count == comm->nRanks && remove) {
     for (int n=system->nodes[NET].count-1; n>=0; n--)
       NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
   }
@@ -808,8 +792,14 @@ static int nextPow2(int v) {
 
 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
   /* here we already honor comm->max/minCTAs for p2pnChannels. */
-  comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
-  comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
+  if (comm->sharedRes->owner != comm) {
+    comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
+    comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()), comm->sharedRes->tpP2pNChannels);
+  } else {
+    comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
+    comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
+  }
+
   int minChannels = comm->p2pnChannels;
   // We need to loop through all local GPUs to have a global picture
   for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
@@ -857,14 +847,10 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
   int nvbGpus = 0;
   for (int g=0; g<ngpus; g++) {
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-    int j=0;
-    for ( ; j<gpu->gpu.nRanksPerGpu; j++ ){
-      if (gpu->gpu.rank[j] == rank) break;
-    }
-    if ( j == gpu->gpu.nRanksPerGpu ) continue;
+    if (gpu->gpu.rank != rank) continue;
     for (int p=0; p<ngpus; p++) {
       if (gpu->paths[GPU][p].type == PATH_NVB) {
-        (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank[j];
+        (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
       }
     }
   }
diff --git a/src/graph/rome_models.cc b/src/graph/rome_models.cc
index 67e31a793b..f5ce15b168 100644
--- a/src/graph/rome_models.cc
+++ b/src/graph/rome_models.cc
@@ -691,7 +691,7 @@ ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct n
               if (g == system->nodes[GPU].nodes[j].gpu.dev)
                 break;
             if (j < ngpus)
-              graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank[0];
+              graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
             else
               return ncclInternalError;
           }
@@ -725,7 +725,7 @@ end:
   if (graph->id == 1) {
     for (int i=0; i<graph->nChannels; i++) {
       int net;
-      ncclTopoGetLocalNet(system, graph->intra[i*ngpus+1], &net);
+      ncclTopoGetLocalNet(system, graph->intra[i*ngpus+1], i, &net);
       graph->inter[i*2+1] = net;
     }
   }
@@ -788,7 +788,7 @@ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, str
               break;
           if (j < ngpus)
           {
-            graph->treeBase[r][x] = system->nodes[GPU].nodes[j].gpu.rank[0];
+            graph->treeBase[r][x] = system->nodes[GPU].nodes[j].gpu.rank;
             y=r;
           }
           else
@@ -926,15 +926,15 @@ ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGrap
       // find the first unsed GPU that is closest to NIC
       int f, m;
       for (f = 0; f < ngpus; f++) {
-        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank[0]) break;
+        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
         if(j >= n) break;
       }
       for (int i = 0; i < ngpus; i++) {
-        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank[0]) break;
+        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
         if (j < n) continue;
         if (paths[i].count < paths[f].count) f = i;
       }
-      for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank[0]) break;
+      for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
       used[n] = graph->intra[n*ngpus+m];
       for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
       for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
diff --git a/src/graph/search.cc b/src/graph/search.cc
index a8f840ce80..45364fb8ba 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -13,6 +13,8 @@
 #include <sys/time.h>
 #include "rome_models.h"
 
+NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+
 // Initialize system->maxBw. This is the per-channel (i.e. per-SM)
 // max bw.
 static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
@@ -109,15 +111,26 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
   if (type1 == -1) return ncclSuccess;
   struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1;
   struct ncclTopoLinkList* path = node1->paths[type2]+index2;
+  struct ncclTopoNode* node2 = system->nodes[type2].nodes+index2;
+  struct ncclTopoLinkList* revPath = node2->paths[type1]+index1;
+
+  if (path == NULL) {
+    WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
+    return ncclInternalError;
+  }
   if (path->count == 0 ) return ncclSuccess;
 
   // Now check link type
   *node = NULL;
-  int intra = type1 == GPU && type2 == GPU;
+  int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
   float bw = intra ? graph->bwIntra : graph->bwInter;
   int type = intra ? graph->typeIntra : graph->typeInter;
 
   if (mult == 1 && (path->type > type)) return ncclSuccess;
+  if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
+        graph->pattern == NCCL_TOPO_PATTERN_TREE ||
+        graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) &&
+      (revPath->type > type)) return ncclSuccess;
 
   bw *= mult;
 
@@ -186,11 +199,9 @@ static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
 
 static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* index) {
   for (int g=0; g<system->nodes[GPU].count; g++) {
-    for (int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++) {
-      if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
-	*index = g;
-	return ncclSuccess;
-      }
+    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+      *index = g;
+      return ncclSuccess;
     }
   }
   WARN("Could not find gpu rank %d", rank);
@@ -259,7 +270,7 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
 ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
 
 // Try to keep all searchs within one second
-#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18)
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (5ULL<<16)
 #define NCCL_SEARCH_TIMEOUT (1<<14)
 #define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
 #define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
@@ -272,13 +283,9 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
   if (graph->nChannels == 0) return ncclInternalError;
   int ngpus = system->nodes[GPU].count;
   int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
-  for (int i=0; i<ngpus; i++) {
-    for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
-      if (system->nodes[GPU].nodes[i].gpu.rank[j] == nextRank) {
-	    *g = i;
-	    return ncclSuccess;
-      }
-    }
+  for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
+    *g = i;
+    return ncclSuccess;
   }
   if (*g == -1) return ncclInternalError;
   return ncclSuccess;
@@ -308,26 +315,18 @@ static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph
       int n = graph->intra[ngpus*c+((i+1)%ngpus)];
       struct ncclTopoNode *node;
       int j;
-      for (j=0; j<ngpus; j++) {
-	bool found=false;
-	for (int k=0; k<system->nodes[GPU].nodes[j].gpu.nRanksPerGpu; k++) {
-	  if (system->nodes[GPU].nodes[j].gpu.rank[k] == g)
-	    found = true;
-	}
-	if (found) break;
-      }
+      for (j=0; j<ngpus; j++)
+        if (system->nodes[GPU].nodes[j].gpu.rank == g) break;
       if (j<ngpus) {
         node = system->nodes[GPU].nodes+j;
         for (int k = 0; k<system->nodes[GPU].count; k++) {
           if (node->paths[GPU][k].count == 1) {
             struct ncclTopoLink* link = node->paths[GPU][k].list[0];
             struct ncclTopoNode* remNode = link->remNode;
-	    for (int l=0; l<remNode->gpu.nRanksPerGpu; l++) {
-	      if (remNode->gpu.rank[l] == n) {
-		if (link->type == LINK_NVL)
-		  count ++;
-	      }
-	    }
+            if (remNode->gpu.rank == n) {
+              if (link->type == LINK_NVL)
+                count ++;
+            }
           }
         }
       }
@@ -336,17 +335,57 @@ static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph
   return count;
 }
 
+ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { // NVLS search step: follow NVS 0 <-> every GPU path in both directions (GPU g weighted double), continue the recursive search if all succeed, then replay the same calls with negated multipliers.
+  struct ncclTopoNode* nvs; // out-param of the GPU->NVS calls; NULL is treated below as "path could not be followed"
+  struct ncclTopoNode* gpu; // out-param of the NVS->GPU calls; NULL is treated below as "path could not be followed"
+  int d0=0; // See if there is enough bandwidth for NVS->GPU traffic
+  do { // d0 counts the NVS->GPU calls attempted so far
+    NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu)); // multiplier is 2 for GPU index g, 1 for every other GPU
+    d0++;
+  } while (gpu && d0 < system->nodes[GPU].count); // stop at the first NULL result or once every GPU index has been tried
+  if (gpu == NULL) {
+    d0--; // exclude the failed index from the undo loop at the bottom (presumably the failed call changed nothing -- TODO confirm in ncclTopoFollowPath)
+  } else {
+    int d1=0; // See if there is enough bandwidth for GPU->NVS traffic
+    do { // mirror of the loop above, in the GPU->NVS direction
+      NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs)); // same 2-vs-1 weighting as the NVS->GPU pass
+      d1++;
+    } while (nvs && d1 < system->nodes[GPU].count);
+    if (nvs == NULL) {
+      d1--; // exclude the failed index from the undo loop below
+    } else { // Both directions worked. Move on to the next path.
+      NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time)); // called with a NULL GPU node and -1/-1/0 for the remaining search arguments
+    }
+    while (d1) { // undo the GPU->NVS calls in reverse order
+      d1--;
+      NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs)); // negated multiplier of the matching forward call
+    }
+  }
+  while (d0) { // undo the NVS->GPU calls in reverse order
+    d0--;
+    NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu)); // negated multiplier of the matching forward call
+  }
+  return ncclSuccess; // always ncclSuccess unless an NCCLCHECK above returned early; results are communicated through graph/saveGraph by the recursive call
+}
+
 ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
-  // 1. Constraint to get the same nChannels between Rings and Trees
+  // 1. Try to get the same nChannels between Rings and Trees
   if (graph->nChannels < graph->minChannels) return ncclSuccess;
 
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
+    if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
+    return ncclSuccess;
+  }
   // 2. Try to get better bandwidth
-  if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;
-  if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
+  // Give a 15% perf bonus to paths not crossing nics
+  float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
+  if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) {
     *copy = 1;
     return ncclSuccess;
   }
-  // 3. Less hops (but not at the price of going cross NICs)
+  if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess;
+
+  // 3. Less hops
   if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
 
   // 4. Prefer graph with more XGMI connections
@@ -426,7 +465,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
     graph->nChannels--;
     return ncclSuccess;
   }
-  graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank[0];
+  graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
   int g = gpu - system->nodes[GPU].nodes;
   if (step == backToNet) {
     // first get back to NIC
@@ -467,6 +506,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
       }
       free(nets);
     }
+  } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
+    NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
   } else if (step < system->nodes[GPU].count-1) {
     // Go to next GPU
     int next[NCCL_TOPO_MAX_NODES];
@@ -512,7 +553,6 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
     struct ncclTopoNode* gpu;
     if (graph->collNet && net->net.collSupport == 0) continue;
     if (net->net.bw < bw) continue;
-    if (net->net.maxChannels == 0) continue;
 
     graph->inter[graph->nChannels*2] = net->id;
     graph->latencyInter = net->net.latency;
@@ -523,59 +563,63 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
         system->nodes[NET].nodes[i].net.bw -= bw;
       }
     }
-    net->net.maxChannels--;
 
-    // First try to replay the last channel
-    if (graph->nChannels > 0) {
-      int g;
-      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
-      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
-    }
-    if (graph->nChannels == 0 || graph->sameChannels == 0) {
-      if (graph->nChannels == 0) {
-        // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
-        struct ncclTopoLinkList* paths = net->paths[GPU];
-        int f = 0, f_gdr = 0;
-        // find the first GPU that is closest to NIC
-        for (int i = 0; i<system->nodes[GPU].count; i++) {
-          if (paths[i].count <= paths[f].count) {
-            // prefer GPU direct RDMA
-            int gdr;
-            NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
-            if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
-              f = i;
-              f_gdr = gdr;
+    // NVLS needs to balance on all NICs
+    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
+    } else {
+      if (graph->nChannels > 0) {
+        // Try to replay the last channel
+        int g;
+        NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
+      }
+      if (graph->nChannels == 0 || graph->sameChannels == 0) {
+        if (graph->nChannels == 0) {
+          // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
+          struct ncclTopoLinkList* paths = net->paths[GPU];
+          int f = 0, f_gdr = 0;
+          // find the first GPU that is closest to NIC
+          for (int i = 0; i<system->nodes[GPU].count; i++) {
+            if (paths[i].count <= paths[f].count) {
+              // prefer GPU direct RDMA
+              int gdr;
+              NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
+              if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
+                f = i;
+                f_gdr = gdr;
+              }
             }
           }
+          int t = 1 << 10;
+          NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
+          if (t == -1) *time = -1;
         }
-        int t = 1 << 10;
-        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? FORCED_ORDER_PCI : 0, &t, NET, n, f));
-        if (t == -1) *time = -1;
-      }
 
-      // Then try the most local GPUs
-      float maxBw = 0;
-      int minHops = 0xfffffff;
-      struct ncclTopoLinkList* paths = net->paths[GPU];
-      for (int g=0; g<system->nodes[GPU].count; g++) {
-        if (paths[g].bw > maxBw) {
-          maxBw = paths[g].bw;
-          minHops = paths[g].count;
-        } else if (paths[g].bw == maxBw && paths[g].count < minHops) {
-          minHops = paths[g].count;
+        // Then try the most local GPUs
+        float maxBw = 0;
+        int minHops = 0xfffffff;
+        struct ncclTopoLinkList* paths = net->paths[GPU];
+        for (int g=0; g<system->nodes[GPU].count; g++) {
+          if (paths[g].bw > maxBw) {
+            maxBw = paths[g].bw;
+            minHops = paths[g].count;
+          } else if (paths[g].bw == maxBw && paths[g].count < minHops) {
+            minHops = paths[g].count;
+          }
         }
-      }
-      if (maxBw >= bw) {
-        // In the first loop, avoid using GPUs in both directions between channels (one channel
-        // sending from that GPU and one channel receiving to that GPU), since that usually leads
-        // to lower BW.
-        for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
-          for (int g=0; g<system->nodes[GPU].count; g++) {
-            if (paths[g].bw == maxBw && paths[g].count == minHops) {
-              gpu = system->nodes[GPU].nodes+g;
-              int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
-              if (tryGpuBidir == gpuUsed) {
-                NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
+        if (maxBw >= bw) {
+          // In the first loop, avoid using GPUs in both directions between channels (one channel
+          // sending from that GPU and one channel receiving to that GPU), since that usually leads
+          // to lower BW.
+          for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
+            for (int g=0; g<system->nodes[GPU].count; g++) {
+              if (paths[g].bw == maxBw && paths[g].count == minHops) {
+                gpu = system->nodes[GPU].nodes+g;
+                int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
+                if (tryGpuBidir == gpuUsed) {
+                  NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
+                }
               }
             }
           }
@@ -583,7 +627,6 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
       }
     }
 
-    net->net.maxChannels++;
     for (int i=0; i<system->nodes[NET].count; i++) {
       if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
           (system->nodes[NET].nodes[i].net.port == net->net.port)) {
@@ -634,7 +677,10 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
     ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
   } else {
     // Intra-node only.
-    if (graph->nChannels == 0) {
+    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels));
+      return ncclSuccess;
+    } else if (graph->nChannels == 0) {
       // Try PCI order first
       NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
     } else {
@@ -683,7 +729,7 @@ ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, st
     } else if (strcmp(sub->name, "gpu") == 0) {
       int rank = -1;
       for (int g=0; g<ngpus; g++) {
-        if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank[0];
+        if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
       }
       if (rank == -1) {
         WARN("XML Import Channel : dev %d not found.", dev);
@@ -701,7 +747,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
 
   int crossNic;
   NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
-  if (graph->crossNic == 0 && crossNic == 1) return ncclSuccess;
+  if (ncclParamCrossNic() == 0 && crossNic == 1) return ncclSuccess;
   graph->crossNic = crossNic;
 
   NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
@@ -744,9 +790,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc
     NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
     int dev = -1;
     for (int i=0; i<ngpus; i++) {
-       for ( int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
-	 if (system->nodes[GPU].nodes[i].gpu.rank[j] == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
-       }
+      if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
     }
     if (dev == -1) {
       WARN("XML Export Channel : rank %d not found.", intra[g]);
@@ -795,50 +839,39 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 float speedArrayIntra[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 float speedArrayInter[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
+#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 #else
-float speedArrayIntra[] = { 44.0, 30.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
-float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
-#endif
+float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
+float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 
-RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
-NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
+float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
+#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
+#endif
 
-static void ncclExpandMultiRank(ncclTopoSystem* system, struct ncclTopoGraph* graph)
-{
-  // Expand the intra array to the multi-ranks per node scenario
-  int ngpus = system->nodes[GPU].count;
-  int intraCpy[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
-  TRACE(NCCL_GRAPH, "TopoCompute: expanding intra array for multi-rank per GPU scenarios nChannels %d", graph->nChannels);
-  memcpy(intraCpy, graph->intra, ngpus*sizeof(int)*graph->nChannels);
-  int tk=0;
-  for (int n=0; n<graph->nChannels; n++ ) {
-    for (int i=0; i<ngpus; i++) {
-      for (int j=0; j<ngpus; j++) {
-	if (intraCpy[n*ngpus+i] == system->nodes[GPU].nodes[j].gpu.rank[0] ) {
-	  for (int k=0; k<system->nodes[GPU].nodes[j].gpu.nRanksPerGpu; k++) {
-	    graph->intra[tk++] = system->nodes[GPU].nodes[j].gpu.rank[k];
-	  }
-	}
-      }
-    }
-  }
-}
+RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
 
 ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
   int ngpus = system->nodes[GPU].count;
   graph->crossNic = ncclParamCrossNic();
-  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
+  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
+	 (graph->pattern == NCCL_TOPO_PATTERN_RING ||
+	  graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
+	  graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;
   graph->bwIntra = graph->bwInter = 0;
   graph->latencyInter = 0;
   if (graph->crossNic == 2) graph->crossNic = 0;
   graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
   graph->typeInter = PATH_PIX;
   graph->nChannels = 0;
-  graph->sameChannels = 1;
   graph->nIntraChannels = 0;
   memset(graph->intraNets, 0, MAXCHANNELS*NCCL_TOPO_MAX_NODES*2*sizeof(int));
+  int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
+  graph->sameChannels = trySameChannels;
 
   char* str = getenv("NCCL_GRAPH_FILE");
   if (str) {
@@ -850,10 +883,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
     NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
     INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
     free(xml);
-    if (graph->nChannels > 0) {
-      ncclExpandMultiRank(system, graph);
-      return ncclSuccess;
-    }
+    if (graph->nChannels > 0) return ncclSuccess;
   }
 
   str = getenv("NCCL_RINGS");
@@ -866,29 +896,17 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
   } else if (!rcclParamModelMatchingDisable() && !graph->collNet) {
     // try to match 8P6L
     NCCLCHECK(parseChordalRing(system, graph));
-    if (graph->nChannels) {
-      ncclExpandMultiRank(system, graph);
-      return ncclSuccess;
-    }
+    if (graph->nChannels) return ncclSuccess;
     // try to match Rome 4P2H
     NCCLCHECK(parseRome4P2H(system, graph));
-    if (graph->nChannels) {
-      ncclExpandMultiRank(system, graph);
-      return ncclSuccess;
-    }
+    if (graph->nChannels) return ncclSuccess;
     // try to match 1H16P
     NCCLCHECK(parse1H16P(system, graph));
-    if (graph->nChannels) {
-      ncclExpandMultiRank(system, graph);
-      return ncclSuccess;
-    }
+    if (graph->nChannels) return ncclSuccess;
     // try to match 4H4P
     NCCLCHECK(parse4H4P(system, graph));
   }
-  if (graph->nChannels) {
-    ncclExpandMultiRank(system, graph);
-      return ncclSuccess;
-  }
+  if (graph->nChannels) return ncclSuccess;
 
   if ((graph->pattern == NCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) {
     // limit single node max channels when searching ring graph on Rome
@@ -898,6 +916,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
 
   int ccMin;
   NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
+
+  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
+
+  if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
+    // Force intra-node NVLS algorithm to pull evenly from all GPUs.
+    graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
+  }
 
   struct ncclTopoGraph tmpGraph;
   memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
@@ -914,7 +940,10 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
   }
   int pass = 1;
   int speedIndex = 0;
-  while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
+  float maxBw = system->maxBw;
+  float totalBw = system->totalBw;
+  if (ngpus == 1 || graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
+  while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++;
   tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
   int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
 
@@ -948,12 +977,19 @@ search:
       tmpGraph.sameChannels = 0;
       goto search;
     }
-    tmpGraph.sameChannels = 1;
+    tmpGraph.sameChannels = trySameChannels;
 
     if (time != -1) globalTimeout += time;
     else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
     if (globalTimeout < 0 && graph->nChannels) goto done;
 
+    // Try a simpler tree
+    if (ccMin >= 90 && tmpGraph.pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
+      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+      goto search;
+    }
+    tmpGraph.pattern = graph->pattern;
+
     int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
     if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
       tmpGraph.typeIntra += 1;
@@ -974,20 +1010,13 @@ search:
     }
     tmpGraph.crossNic = 0;
 
-    // Try a simpler tree
-    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
-      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
-      goto search;
-    }
-    tmpGraph.pattern = graph->pattern;
-
     // Decrease bw until we find a solution
     if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
       tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
       goto search;
     }
     speedIndex = 0;
-    while (speedArray[speedIndex] > system->maxBw && speedIndex < nspeeds-1) speedIndex++;
+    while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++;
     tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
 
   }
@@ -1016,24 +1045,26 @@ done:
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
   }
 
-  if (graph->nChannels == 0 && graph->collNet == 0) {
+  if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
     WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
-    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank[0];
+    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
     graph->inter[0] = graph->inter[1] = 0;
     graph->bwIntra = graph->bwInter = 0.1;
     graph->typeIntra = graph->typeInter = PATH_SYS;
     graph->nChannels = 1;
   }
 
-  if (graph->bwIntra >= 25.0) {
-    int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
-    memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
-    memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
-    graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
-    graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
-    graph->nChannels = dupChannels;
-  }
-  ncclExpandMultiRank(system, graph);
+  if (graph->nChannels == 0) return ncclSuccess;
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
+  if (graph->bwIntra < 25.0) return ncclSuccess;
+  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
+
+  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
+  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
+  memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
+  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
+  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
+  graph->nChannels = dupChannels;
   return ncclSuccess;
 }
 
@@ -1085,23 +1116,40 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
   return ncclSuccess;
 }
 
+#include "comm.h"
+// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head: store it in *dev, or warn and return ncclInternalError when no channel starts with comm->rank.
+ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) {
+  int localRanks = comm->topo->nodes[GPU].count; // graph->intra is indexed with this stride per channel
+  for (int c=0; c<graph->nChannels; c++) {
+    if (graph->intra[c*localRanks] == comm->rank) { // first intra entry of channel c is our rank
+      *dev = graph->inter[c*2]; // graph->inter is indexed with a stride of 2 per channel; take the first entry
+      return ncclSuccess;
+    }
+  }
+  WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
+  return ncclInternalError;
+}
+
 // 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
 NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
 
-#include "comm.h"
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
   if (graph) {
     // Honor the net device in the graph
     int channel = channelId%graph->nChannels;
     int ngpus = comm->topo->nodes[GPU].count;
     int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
-    *dev = graph->inter[channel*2+index];
+    if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
+      *dev = graph->inter[channel*2+index];
+    } else {
+      NCCLCHECK(getNvlsNetDev(comm, graph, dev));
+    }
     NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
   } else if (peerRank == -1) {
     return ncclInternalError;
   } else {
     // Start with our local NIC and local Rank
-    NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
+    NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, dev));
     *proxyRank = rank;
 
     int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
@@ -1111,7 +1159,9 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
       int cudaDev = comm->peerInfo[peerRank].cudaDev;
       int localRank;
       if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess;
-      int netDev = comm->peerInfo[localRank].netDev;
+      int netDev;
+      NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
+
       int n;
       // Check that device exists on our node
       if (ncclParamCrossNic() == 0) {
@@ -1131,20 +1181,17 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
           NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
         }
       } else if (pxnLevel == 2) {
-        // Check whether we can access it through our node-local GPU for that NIC.
-        for (int r=0; r<comm->localRanks; r++) {
-          int peerRank = comm->localRankToRank[r];
-          if (comm->peerInfo[peerRank].netDev == netDev) {
-            int g1, g2, n;
-            NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
-            NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2));
-            NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
-            struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
-            if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
-              *proxyRank = peerRank;
-              *dev = netDev;
-              return ncclSuccess;
-            }
+        // Check which local GPU corresponds to that NIC and see if we can use PXN.
+        int n, g1, g2;
+        NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
+        NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
+        NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netDev, &g2));
+        if (g2 != -1) {
+          struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
+          if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
+            *proxyRank = peerGpu->gpu.rank;
+            *dev = netDev;
+            return ncclSuccess;
           }
         }
       }
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index d41293f7dc..bd4c75310f 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -117,10 +117,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
     n->links[0].remNode = n;
     n->links[0].bw = LOC_BW;
     n->gpu.dev = NCCL_TOPO_UNDEF;
-    for (int i=0; i<RCCL_TOPO_MAX_RANKS_PER_GPU; i++) {
-      n->gpu.rank[i] = NCCL_TOPO_UNDEF;
-    }
-    n->gpu.nRanksPerGpu = NCCL_TOPO_UNDEF;
+    n->gpu.rank = NCCL_TOPO_UNDEF;
     n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
   } else if (type == CPU) {
     n->cpu.arch = NCCL_TOPO_UNDEF;
@@ -256,15 +253,7 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
 
 static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
   if (node->type == GPU) {
-    sprintf(line+offset, "%s/%lX (%d", topoNodeTypeStr[node->type], node->id, node->gpu.rank[0]);
-    int nextOffset;
-    int nextRank = 1;
-    while ( nextRank < node->gpu.nRanksPerGpu ) {
-      nextOffset = strlen(line);
-      sprintf(line+nextOffset, "/%d", node->gpu.rank[nextRank++]);
-    }
-    nextOffset = strlen(line);
-    sprintf(line+nextOffset, ")");
+    sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
   } else if (node->type == CPU) {
     sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
   } else if (node->type == PCI) {
@@ -384,17 +373,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
   rcclHipDeviceArch_t arch;
   NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));
   memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t));
-
-  //NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
-  const char *rankStr;
-  NCCLCHECK(xmlGetAttrStr(xmlGpu, "rank", &rankStr));
-  char *tmpStr;
-  char *token = strtok_r ( (char *)rankStr, ",", &tmpStr);
-  gpu->gpu.nRanksPerGpu = 0;
-  while (token != NULL && gpu->gpu.nRanksPerGpu < RCCL_TOPO_MAX_RANKS_PER_GPU) {
-    gpu->gpu.rank[gpu->gpu.nRanksPerGpu++] = atoi(token);
-    token = strtok_r(NULL, ",", &tmpStr);
-  }
+  NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
   NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
   NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
   // Do not go any further, nvlinks will be added in a second pass
@@ -406,7 +385,6 @@ struct kvDict kvDictPciGen[] = {
   { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
   { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
   { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
-
 ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
   const char* str;
 
@@ -716,8 +694,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
       NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
       if (node == NULL) continue;
       NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
-      //NCCLCHECK(xmlSetAttrInt(node, "rank", r));
-      NCCLCHECK(xmlSetOrAppendAttrInt(node, "rank", r));
+      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
       NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
     }
   }
@@ -744,11 +721,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     }
   }
   if (netDevCount == 0) {
-    NCCLCHECK(ncclNetDevices(comm, &netDevCount));
+    NCCLCHECK(comm->ncclNet->devices(&netDevCount));
   }
   for (int n=0; n<netDevCount; n++) {
     ncclNetProperties_t props;
-    NCCLCHECK(ncclNetGetProperties(comm, n, &props));
+    NCCLCHECK(comm->ncclNet->getProperties(n, &props));
     struct ncclXmlNode* netNode;
     NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
     NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -777,10 +754,8 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) {
-  int g;
-  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
-  int minType = PATH_SYS;
+static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
+  int minType = PATH_DIS;
   float maxBw = 0;
   int count = 0;
   int* nets;
@@ -790,20 +765,115 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* i
     if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
       maxBw = path->bw;
       minType = path->type;
+      if (type) *type = minType;
       count = 0;
     }
     if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
   }
-  if (count == 0) {
-    *id = -1;
-    free(nets);
+
+  *localNetMask = 0ULL; // Fold the selected NET ids into a bitmask, one bit per id.
+  for (int n=0; n<count; n++) {
+    if (nets[n] >= 64) { free(nets); return ncclInternalError; } // id does not fit the 64-bit mask; release nets on this path too
+    *localNetMask |= 1ULL<<nets[n];
+  }
+  free(nets);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
+  // Picks the NET id used by (rank, channelId): GPUs sharing the same local NET mask take turns over its bits.
+  if (channelId < 0) return ncclInternalError; // the round-robin below only counts channels upwards from 0
+  int gpu;
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); // resolved before allocating so a failure cannot leak
+  uint64_t* localNetMasks;
+  int ngpus = system->nodes[GPU].count;
+  NCCLCHECK(ncclCalloc(&localNetMasks, ngpus));
+
+  // Fill localNetMasks for all GPUs, releasing the array if any lookup fails.
+  for (int g=0; g<ngpus; g++) {
+    ncclResult_t res = getLocalNetMask(system, g, localNetMasks+g, NULL);
+    if (res != ncclSuccess) { free(localNetMasks); return res; }
+  }
+
+  // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
+  int netLocalGpus = 0, netLocalGpu = 0;
+  for (int g=0; g<ngpus; g++) {
+    if (localNetMasks[g] == localNetMasks[gpu]) {
+      if (g == gpu) netLocalGpu = netLocalGpus;
+      netLocalGpus++;
+    }
+  }
+  uint64_t localNetMask = localNetMasks[gpu];
+  free(localNetMasks);
+  if (localNetMask == 0) return ncclInternalError;
+
+  // Round robin on GPUs and channels
+  int gIndex = 0, cId = 0, n = 0;
+  while (1) {
+    if (1ULL << n & localNetMask) {
+      if (gIndex == netLocalGpu && cId == channelId) {
+        *id = n;
+        return ncclSuccess;
+      }
+      gIndex++;
+      if (gIndex == netLocalGpus) { gIndex = 0; cId++; }
+    }
+    n = (n+1) % 64;
+  }
+}
+
+// Finds the local GPU paired with NET id net; *gpuIndex receives its index in system->nodes[GPU], or -1 when no mask containing net was retained.
+ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
+  if (net < 0 || net >= 64) return ncclInternalError; // masks are 64 bits wide; 1ULL<<net must stay in range
+  int ngpus = system->nodes[GPU].count;
+  int* gpus;
+  NCCLCHECK(ncclCalloc(&gpus, ngpus));
+
+  // Among GPUs whose local NET mask includes net, collect those with the smallest path type.
+  int netLocalGpus = 0, minType = PATH_DIS;
+  uint64_t localNetMask = 0ULL;
+  for (int g=0; g<ngpus; g++) {
+    int type = PATH_DIS;
+    uint64_t mask;
+    ncclResult_t res = getLocalNetMask(system, g, &mask, &type);
+    if (res != ncclSuccess) { free(gpus); return res; }
+    if ((1ULL<<net) & mask) {
+      if (type < minType) {
+        localNetMask = mask;
+        netLocalGpus = 0;
+        minType = type;
+      }
+      if (type == minType) {
+        if (localNetMask && mask != localNetMask) {
+          WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus-1], minType, net, mask, localNetMask);
+          free(gpus);
+          return ncclInternalError;
+        }
+        gpus[netLocalGpus] = g;
+        netLocalGpus++;
+      }
+    }
+  }
+  if (localNetMask == 0ULL) {
+    *gpuIndex = -1;
+    free(gpus);
     return ncclSuccess;
   }
 
-  int rr = system->nodes[GPU].nodes[g].gpu.dev;
-  *id = nets[rr%count];
-  free(nets);
-  return ncclSuccess;
+  // Walk the set bits of the mask, cycling through the collected GPUs, until we reach net.
+  int gIndex = 0, n = 0;
+  while (1) {
+    if (1ULL << n & localNetMask) {
+      if (n == net) {
+        *gpuIndex = gpus[gIndex];
+        free(gpus);
+        return ncclSuccess;
+      }
+      gIndex++;
+      if (gIndex == netLocalGpus) gIndex = 0;
+    }
+    n = (n+1) % 64;
+  }
 }
 
 /****************************/
@@ -822,20 +892,18 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
   struct ncclTopoNode* cpu = NULL, *gpu = NULL;
   for (int g=0; g<system->nodes[GPU].count; g++) {
-    for (int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++) {
-      if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
-	gpu = system->nodes[GPU].nodes+g;
-	// Find closer CPU
-	int cpuIndex = -1, minHops = 0;
-	for (int c=0; c<system->nodes[CPU].count; c++) {
-	  int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
-	  if (cpuIndex == -1 || nHops < minHops) {
-	    cpuIndex = c;
-	    minHops = nHops;
-	  }
-	}
-	cpu = system->nodes[CPU].nodes+cpuIndex;
+    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+      gpu = system->nodes[GPU].nodes+g;
+      // Find closer CPU
+      int cpuIndex = -1, minHops = 0;
+      for (int c=0; c<system->nodes[CPU].count; c++) {
+        int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
+        if (cpuIndex == -1 || nHops < minHops) {
+          cpuIndex = c;
+          minHops = nHops;
+        }
       }
+      cpu = system->nodes[CPU].nodes+cpuIndex;
     }
   }
   if (cpu == NULL) {
@@ -885,6 +953,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count) {
+  *count = system->nodes[GPU].count; // number of GPU nodes recorded in the topology
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
   *count = system->nodes[NET].count;
   return ncclSuccess;
@@ -910,11 +983,9 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
 
 ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
   for (int g=0; g<system->nodes[GPU].count; g++) {
-    for ( int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++ ){
-      if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
-	    *localRank = g;
-	    return ncclSuccess;
-      }
+    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+      *localRank = g;
+      return ncclSuccess;
     }
   }
   WARN("Could not find local GPU with rank %d", rank);
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 72a294837c..cc995d4ccf 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -13,12 +13,13 @@
 
 #define LOC_BW 5000.0
 #define SM60_NVLINK_BW 18.0
-#define SM70_NVLINK_BW 22.0
-#define SM80_NVLINK_BW 22.0
+#define SM70_NVLINK_BW 20.0
+#define SM80_NVLINK_BW 20.0
+#define SM90_NVLINK_BW 20.0
 #define SM86_NVLINK_BW 12.0
 #define PCI_BW 12.0           // PCI Gen3 x16
 #define QPI_BW 6.0
-#define SKL_QPI_BW 9.0
+#define SKL_QPI_BW 10.0
 #define ZPI_BW 6.0
 #define YONGFENG_ZPI_BW 9.0
 #define P9_BW 32.0
@@ -75,7 +76,12 @@ extern const char* topoLinkTypeStr[];
 
 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
 #define PATH_SYS 7
-#define PATH_DIS 7
+
+// Connection through the network
+#define PATH_NET 8
+
+// Disconnected
+#define PATH_DIS 9
 extern const char* topoPathTypeStr[];
 
 struct ncclTopoNode;
@@ -106,7 +112,6 @@ struct ncclTopoLinkList {
 #define RCCL_TOPO_FORCE_INTRA 16
 #define RCCL_TOPO_XGMI_ALL  32
 
-#define RCCL_TOPO_MAX_RANKS_PER_GPU 8
 struct ncclTopoNode {
   int type;
   int64_t id;
@@ -114,8 +119,7 @@ struct ncclTopoNode {
   union {
     struct {
       int dev; // NVML dev number
-      int rank[RCCL_TOPO_MAX_RANKS_PER_GPU];
-      int nRanksPerGpu;
+      int rank;
       int cudaCompCap;
       int gdrSupport;
       int gcn;
@@ -198,11 +202,9 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i
 static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) {
   *index = -1;
   for (int i=0; i<system->nodes[GPU].count; i++) {
-    for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
-      if (system->nodes[GPU].nodes[i].gpu.rank[j] == rank) {
-	    *index = i;
-	    return ncclSuccess;
-      }
+    if (system->nodes[GPU].nodes[i].gpu.rank == rank) {
+      *index = i;
+      return ncclSuccess;
     }
   }
   return ncclInternalError;
@@ -212,7 +214,7 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
   *rank = -1;
   for (int i=0; i<system->nodes[GPU].count; i++) {
     if (system->nodes[GPU].nodes[i].gpu.dev == dev) {
-      *rank = system->nodes[GPU].nodes[i].gpu.rank[0];
+      *rank = system->nodes[GPU].nodes[i].gpu.rank;
       return ncclSuccess;
     }
   }
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 1a6d04c772..7ab30f3575 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -54,7 +54,10 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
 
 // Latencies in us, Bandwidths in GB/s
 // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
-static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 } };
+static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { 
+       { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 },   // Tree, Ring
+       { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 },   // Collnet Direct, Chain
+       {    0,    0,    0 }, {    0,    0,    0 }};  // NVLS, NVLS Tree
 
 // NVLink, PCI, Network
 #define NCCL_HW_NVLINK 0
@@ -71,18 +74,18 @@ struct tuningModel {
 static struct tuningModel tuning_model_0 {
   .hwLat = {
     /* NVLINK */
-    { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* PCI */
-    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* NET */
-    { /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .bwRatio = {
     /* 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* more than 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .treeCorrectionFactor = {
@@ -101,18 +104,18 @@ static struct tuningModel tuning_model_0 {
 static struct tuningModel tuning_model_1 {
   .hwLat =
   { /* NVLINK */
-    { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* PCI */
-    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* NET */
-    { /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .bwRatio =
   { /* 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* more than 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .treeCorrectionFactor = {
@@ -131,18 +134,18 @@ static struct tuningModel tuning_model_1 {
 static struct tuningModel tuning_model_2 {
   .hwLat = {
     /* NVLINK */
-    { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* PCI */
-    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* NET */
-    { /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .bwRatio = {
     /* 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* more than 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .treeCorrectionFactor = {
@@ -161,18 +164,18 @@ static struct tuningModel tuning_model_2 {
 static struct tuningModel tuning_model_3 {
   .hwLat = {
     /* NVLINK */
-    { /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* PCI */
-    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* NET */
-    { /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .bwRatio = {
     /* 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* more than 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .treeCorrectionFactor = {
@@ -191,18 +194,18 @@ static struct tuningModel tuning_model_3 {
 static struct tuningModel tuning_model_4 {
   .hwLat = {
     /* NVLINK */
-    { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* PCI */
-    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* NET */
-    { /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 } },
+    { /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .bwRatio = {
     /* 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
     /* more than 2 nodes */
-    { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
+    { /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
   },
 
   .treeCorrectionFactor = {
@@ -232,21 +235,42 @@ static struct tuningModel rcclTuningModel[] = {
 #define HOPPER_COMPCAP_IDX 2
 
 // LL128 max BW per channel
-static const double ll128MaxBwPerCh[3] = { 20.0, 20.0, 36.7 };
 static const double llMaxBws[3][3] = {
   /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
   /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
   /* Hopper-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}
 };
 
+static const double perChMaxRingLL128Bws[3][3] = { // caps Ring/LL128 busBw per channel; read as [compCapIndex][index2] in ncclTopoTuneModel
+  /* Volta (N1/N2/N4) */  {20.0, 20.0, 20.0},
+  /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
+  /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
+};
+static const double perChMaxTreeLL128Bws[3][3] = { // caps Tree/LL128 busBw per channel; read as [compCapIndex][index2] in ncclTopoTuneModel
+  /* Volta (N1/N2/N4) */  {20.0, 20.0, 20.0},
+  /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
+  /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
+};
 static const double perChMaxTreeBws[3][3] = {
-  /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
+  /* Volta (N1/N2/N4) */  {26.5, 18.5, 10.0},
   /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
-  /* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0},
+  /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
 };
 
-ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
-  int simpleDefaultThreads = (ringGraph->bwIntra*ringGraph->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
+// Network post overhead in ns (1000 = 1 us)
+NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
+
+// Net post overhead in us: the NetOverhead param (given in ns) when set, else 2.0 on x86 AMD and 1.0 on every other CPU.
+static float getNetOverhead(struct ncclComm* comm) {
+  if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001;
+  int cpuArch, cpuVendor, cpuModel;
+  // Checked by hand instead of NCCLCHECK: an early `return <ncclResult_t>` from this float function would be read as a latency.
+  if (ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel) != ncclSuccess) return 1.0;
+  return (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) ? 2.0 : 1.0;
+}
+
+ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
+  int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
@@ -262,7 +286,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
   comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
     comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
-    comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
+    comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] =
+    comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS;
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
     getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
   comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
@@ -281,11 +306,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
   double llMaxBw = llMaxBws[index1][index2];
   double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
+  double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
+  double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
   // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
   //if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
   float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
 
-  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ };
   int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
@@ -299,10 +325,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       nNodes;
 
     for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      if ((coll != ncclFuncAllReduce) && a != NCCL_ALGO_RING) continue;
+      if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
+      if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
+      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING) continue;
+      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        if (a == NCCL_ALGO_NVLS && p != NCCL_PROTO_SIMPLE) continue;
+        if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
         int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
         float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
         float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
@@ -315,13 +344,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
         else
           busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
 #else
-        if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
-        if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f);
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
-        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
         if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
         if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
-        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
+        if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
         if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -331,12 +359,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
           busBw /= factor;
         }
 #endif
-        if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
+        if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;
 
         // Convert bus BW to algorithm BW
         float ratio;
         if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
-        else if (a == NCCL_ALGO_NVLS) ratio = .75;
+        else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0;
+        else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1));
         else ratio = .5;
         comm->bandwidths[coll][a][p] = busBw * ratio;
 
@@ -344,16 +373,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
         float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
         float interLat =  graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
         //if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
+        if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
+
         if (a == NCCL_ALGO_RING) {
           float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
           if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
-            if (ringGraph->sameChannels) {
+            if (graphs[a]->sameChannels) {
               comm->latencies[coll][a][p] += lat;
             } else {
               if (p == NCCL_PROTO_SIMPLE) lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
               comm->latencies[coll][a][p] += nsteps*lat;
             }
           } else {
+            // Inter-node rings still have to launch nsteps * net overhead.
+            float netOverhead = 0.0;
+            if (nNodes > 1) {
+              netOverhead = getNetOverhead(comm);
+              if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
+            }
+            intraLat = std::max(intraLat, netOverhead);
             comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
           }
         } else if (a == NCCL_ALGO_TREE) {
@@ -363,7 +401,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
           comm->latencies[coll][a][p] +=
             2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat;  // Add 0.5 arity serialization latency
         } else if (a == NCCL_ALGO_COLLNET_CHAIN) {
-          comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat;
+          comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
+        } else if (a == NCCL_ALGO_NVLS) {
+          if (nNodes > 1) comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
+        } else if (a == NCCL_ALGO_NVLS_TREE) {
+          comm->latencies[coll][a][p] += 2*(nNodes-1)*rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
         }
       }
     }
@@ -372,7 +414,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   // Protocols/Algorithms enable/disable, and user overrides.
   // All are enabled except ll128 which is enabled by default only in certain cases.
   int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
-  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 };
+  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
 
   const char *protoStr = getenv("NCCL_PROTO");
   if (protoStr) {
@@ -385,15 +427,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
   }
 
-  // Disable NVLink SHARP if not supported
-  if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0;
+  if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
 
   // Disable CollNet if it is not supported
   if (comm->collNetSupport == 0) {
     algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
     algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
+    if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
     // If user has hard set NCCL_ALGO=COLLNET, ignore it
-    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
+    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
+        algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
       algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
       if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
     }
@@ -415,7 +458,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
       pEnable = 1;
       pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
-      pEnable &= (graphs[a]->typeIntra <= PATH_NVL);
+      pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
       pEnable &= (minCompCap == maxCompCap);
       switch (minCompCap) {
       case 70: pEnable &= 1; break;
@@ -433,28 +476,38 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 
   if (comm->rank == 0) {
     char line[1024];
-    sprintf(line, "Latency/AlgBw |");
-    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        sprintf(line+strlen(line), " %7s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
+    for (int block=0; block<2; block++) {
+      sprintf(line, "  Algorithm   |");
+      for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
+	int a = block*NCCL_NUM_ALGORITHMS/2+ba;
+        sprintf(line+strlen(line), " %14s   %14s   %14s |", "", ncclAlgoStr[a], "");
       }
-    }
-    INFO(NCCL_TUNING, "%s", line);
-    sprintf(line, " Max NThreads |");
-    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
-      }
-    }
-    INFO(NCCL_TUNING, "%s", line);
-    for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
-      sprintf(line, "%13s |", ncclFuncStr[c]);
-      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      INFO(NCCL_TUNING, "%s", line);
+      sprintf(line, "  Protocol    |");
+      for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
         for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-          sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+          sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
         }
       }
       INFO(NCCL_TUNING, "%s", line);
+      sprintf(line, " Max NThreads |");
+      for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
+	int a = block*NCCL_NUM_ALGORITHMS/2+ba;
+        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+          sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
+        }
+      }
+      INFO(NCCL_TUNING, "%s", line);
+      for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
+        sprintf(line, "%13s |", ncclFuncStr[c]);
+        for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
+	  int a = block*NCCL_NUM_ALGORITHMS/2+ba;
+          for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+            sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+          }
+        }
+        INFO(NCCL_TUNING, "%s", line);
+      }
     }
   }
 
@@ -514,7 +567,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
   if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
   if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
   if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
-      && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
+      && info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
+    lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
+  }
 #endif
   // Tree pipelining saves latency in aggregation cases
   int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 4780de4eec..138d48e401 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -789,8 +789,8 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
   }
   NCCLCHECK(ncclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode));
 #else
-  nvmlDevice_t nvmlDev = NULL;
-  if (ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
+  nvmlDevice_t nvmlDev;
+  NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
   NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
 #endif
   return ncclSuccess;
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 64f0b4cb8f..5ffa6c90c5 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -178,25 +178,6 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
   return ncclSuccess;
 }
 
-static ncclResult_t xmlSetOrAppendAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
-  int index;
-  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
-  if (index == -1) {
-    index = node->nAttrs++;
-    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
-    node->attrs[index].key[MAX_STR_LEN] = '\0';
-    snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
-    node->attrs[index].value[MAX_STR_LEN] = '\0';
-    return ncclSuccess;
-  }
-  char *tmp = strdup(node->attrs[index].value);
-  snprintf(node->attrs[index].value, MAX_STR_LEN, "%s,%d", tmp, value);
-  node->attrs[index].value[MAX_STR_LEN] = '\0';
-  free (tmp);
-  return ncclSuccess;
-}
-
-
 static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
   int index;
   NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
diff --git a/src/group.cc b/src/group.cc
index 477b34ed32..b266654be9 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -45,13 +45,14 @@ ncclResult_t ncclAsyncLaunch(
     job->undo = undo;
     job->destructor = destructor;
     job->abortFlag = comm->abortFlag;
+    job->childAbortFlag = comm->childAbortFlag;
     job->state = ncclGroupJobRunning;
     job->comm = comm;
     /* check if there are blocking and nonblocking comms at the same time in group. */
     if (ncclGroupBlocking == -1) {
       /* first met communicator */
-      ncclGroupBlocking = comm->blocking;
-    } else if (ncclGroupBlocking != comm->blocking) {
+      ncclGroupBlocking = comm->config.blocking;
+    } else if (ncclGroupBlocking != comm->config.blocking) {
       WARN("Blocking and nonblocking communicators are not allowed in the same group.");
       ret = ncclInvalidArgument;
     }
@@ -87,23 +88,20 @@ ncclResult_t ncclGroupStart() {
   ncclResult_t ret = ncclSuccess;
   NVTX3_FUNC_RANGE_IN(nccl_domain);
 
-  /* if previous group launch does not complete, don't launch this one. */
-  if (ncclGroupJobMainPtr != NULL) {
-    if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
-      ret = ncclInvalidUsage;
-      goto exit;
-    } else {
-      NCCLCHECKGOTO(groupJobComplete(ncclGroupJobMainPtr), ret, exit);
-    }
-  }
   NCCLCHECK(ncclGroupStartInternal());
   TRACE_CALL("ncclGroupStart()");
-
-exit:
   return ret;
 }
 
-ncclResult_t ncclGroupStartInternal() {
+inline ncclResult_t ncclGroupStartInternal() {
+  /* if previous group launch does not complete, don't launch this one. */
+  if (ncclGroupJobMainPtr != NULL) {
+    if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
+      return ncclInvalidUsage;
+    } else {
+      NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr));
+    }
+  }
   ncclGroupDepth++;
   if (mscclAvailable() && !mscclIsCaller()) {
     NCCLCHECK(mscclGroupStart());
@@ -204,13 +202,6 @@ failure:
   return result;
 }
 
-static inline void groupResetJobState() {
-  ncclGroupBlocking = -1;
-  ncclGroupJobMainPtr = NULL;
-  memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
-  return;
-}
-
 static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) {
   struct ncclComm* comm = *groupCommHeadPtr;
 
@@ -255,7 +246,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
       ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
     }
 
-    if (!comm->blocking)
+    if (!comm->config.blocking)
       (void) ncclCommSetAsyncError(comm, error);
     comm = next;
   }
@@ -264,7 +255,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
   while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
     struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
     *job->abortFlag = 1;
-    if (job->comm && !job->comm->blocking)
+    if (job->comm && !job->comm->config.blocking)
       (void) ncclCommSetAsyncError(job->comm, error);
     if (job->undo) job->undo(job);
     if (job->destructor) job->destructor((void*)job);
@@ -339,6 +330,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
 
         if (*groupAbortFlag == true || errorJobAbortFlag == true) {
           *job->abortFlag = 1;
+          if (job->childAbortFlag) *job->childAbortFlag = 1;
         }
 
         job = job->next;
@@ -359,7 +351,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
 
   while (!ncclIntruQueueEmpty(asyncJobsMain)) {
     struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
-    if (job->comm && !job->comm->blocking)
+    if (job->comm && !job->comm->config.blocking)
       (void) ncclCommSetAsyncError(job->comm, ret);
     if (job->destructor) job->destructor((void*)job);
   }
@@ -368,7 +360,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
     struct ncclComm* comm = groupCommHeadMain;
     struct ncclComm* next = comm->groupNext;
     (void) ncclGroupCommLeave(comm);
-    if (!comm->blocking) {
+    if (!comm->config.blocking) {
       (void) ncclCommSetAsyncError(comm, ret);
     }
     groupCommHeadMain = next;
@@ -449,15 +441,6 @@ fail:
   goto exit;
 }
 
-static ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
-  ncclResult_t ret = ncclSuccess;
-  if (job) {
-    ret = ncclAsyncJobComplete(&job->base);
-    groupResetJobState();
-  }
-  return ret;
-}
-
 void ncclGroupJobAbort() {
   ncclGroupJobAbortFlag = true;
   (void) groupJobComplete(ncclGroupJobMainPtr);
diff --git a/src/include/align.h b/src/include/align.h
index e3780fe52c..2a71dd1bc3 100644
--- a/src/include/align.h
+++ b/src/include/align.h
@@ -13,6 +13,9 @@
 #define ROUNDUP(x, y) \
     (DIVUP((x), (y))*(y))
 
+#define ALIGN_POWER(x, y) \
+    ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x)))) /* x > y: ROUNDUP(x, y); otherwise y/(y/x). NOTE(review): x and y are evaluated more than once, and x == 0 divides by zero. */
+
 #define ALIGN_SIZE(size, align) \
   size = ((size + (align) - 1) / (align)) * (align);
 
diff --git a/src/include/alloc.h b/src/include/alloc.h
index 262d0cbb9e..4f47be44d9 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -12,6 +12,7 @@
 #include "checks.h"
 #include "align.h"
 #include "utils.h"
+#include "p2p.h"
 #include <sys/mman.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -87,6 +88,77 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
 #define MAX_ALLOC_TRACK_NGPU 32
 extern struct allocationTracker allocTracker[];
 
+#if CUDART_VERSION >= 11030
+
+#include <cuda.h>
+#include "cudawrap.h"
+// Allocates size bytes (rounded up to the minimum allocation granularity) on the current CUDA device via the cuMem API, maps them read/write at *ptr, and stores the handle in *handlep when handlep is non-NULL.
+static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
+  ncclResult_t result = ncclSuccess; // never reassigned below; returned unchanged on the success path
+  size_t granularity = 0;
+  CUdevice currentDev;
+  CUmemAllocationProp prop = {};
+  CUmemAccessDesc accessDesc = {};
+  CUmemGenericAllocationHandle handle;
+  int cudaDev;
+  int flag = 0;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
+  prop.location.id = currentDev;
+  // Query device to see if RDMA support is available
+  CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
+  if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
+  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+  ALIGN_SIZE(size, granularity); // macro assigns to size: rounds it up to a multiple of granularity
+  /* Allocate the physical memory on the device */
+  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
+  /* Reserve a virtual address range */
+  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); // NOTE(review): if CUCHECK returns early here or below, handle (and later the reserved range) look unreleased — confirm
+  /* Map the virtual address range to the physical allocation */
+  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+  /* Now allow RW access to the newly mapped memory */
+  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  accessDesc.location.id = currentDev;
+  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
+  if (handlep) *handlep = handle; // handlep is optional; callers may pass NULL
+  TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); // NOTE(review): handle printed as %llx here but 0x%llx in ncclCuMemFree
+  return result;
+}
+// Undoes a cuMem allocation at ptr: unmaps it, releases its allocation handle and frees the virtual address range. A NULL ptr is a successful no-op.
+static inline ncclResult_t ncclCuMemFree(void *ptr) {
+  if (ptr == NULL) return ncclSuccess;
+  ncclResult_t result = ncclSuccess; // never reassigned below; returned unchanged on the success path
+  CUmemGenericAllocationHandle handle;
+  size_t size = 0;
+  CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); // look up the allocation handle backing ptr
+  CUCHECK(cuMemRelease(handle)); // presumably balances the reference taken by the retain above — confirm against the CUDA driver API docs
+  CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); // only the size is requested; the base-address output is NULL
+  TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
+  CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+  CUCHECK(cuMemRelease(handle)); // presumably drops the allocation's own reference so the memory can be reclaimed — confirm
+  CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+  return result;
+}
+
+#else
+
+extern int ncclCuMemEnable();
+// Fallback stubs for the #else side of CUDART_VERSION >= 11030: both ignore their arguments, log a warning and return ncclInternalError.
+static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
+  WARN("CUMEM not supported prior to CUDA 11.3");
+  return ncclInternalError;
+}
+static inline ncclResult_t ncclCuMemFree(void *ptr) {
+  WARN("CUMEM not supported prior to CUDA 11.3");
+  return ncclInternalError;
+}
+
+#endif
+
 template <typename T>
 ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
   ncclResult_t result = ncclSuccess;
@@ -193,8 +265,13 @@ template <typename T>
 ncclResult_t ncclCudaFree(T* ptr) {
   ncclResult_t result = ncclSuccess;
   cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+  TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaFree(ptr), result, finish);
+  if (ncclCuMemEnable()) {
+    NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
+  } else {
+    CUDACHECKGOTO(cudaFree(ptr), result, finish);
+  }
 finish:
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
   return result;
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 2ecea7a94f..400a479fbe 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -20,6 +20,7 @@ ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
 ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
+ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
diff --git a/src/include/channel.h b/src/include/channel.h
index 0ebb5a2734..adc38749a5 100644
--- a/src/include/channel.h
+++ b/src/include/channel.h
@@ -9,7 +9,9 @@
 #include "comm.h"
 
 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
-ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
+ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
 static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
   int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
   int peerNode = comm->rankToNode[peer];
diff --git a/src/include/checks.h b/src/include/checks.h
index 048fc06e9b..c9fd16176e 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -18,11 +18,11 @@
     }                                                       \
 } while(false)
 
-#define CUDACHECKGOTO(cmd, res, label) do {                 \
+#define CUDACHECKGOTO(cmd, RES, label) do {                 \
     cudaError_t err = cmd;                                  \
     if( err != cudaSuccess ) {                              \
         WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
-        res = ncclUnhandledCudaError;                       \
+        RES = ncclUnhandledCudaError;                       \
         goto label;                                         \
     }                                                       \
 } while(false)
@@ -60,11 +60,11 @@
   } \
 } while(true)
 
-#define SYSCHECKGOTO(statement, res, label) do { \
+#define SYSCHECKGOTO(statement, RES, label) do { \
   if ((statement) == -1) {    \
     /* Print the back trace*/ \
-    res = ncclSystemError;    \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    RES = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
     goto label; \
   } \
 } while (0);
@@ -72,16 +72,16 @@
 #define NEQCHECK(statement, value) do {   \
   if ((statement) != value) {             \
     /* Print the back trace*/             \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError);    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
     return ncclSystemError;     \
   }                             \
 } while (0);
 
-#define NEQCHECKGOTO(statement, value, res, label) do { \
+#define NEQCHECKGOTO(statement, value, RES, label) do { \
   if ((statement) != value) { \
     /* Print the back trace*/ \
-    res = ncclSystemError;    \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    RES = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
     goto label; \
   } \
 } while (0);
@@ -89,57 +89,57 @@
 #define EQCHECK(statement, value) do {    \
   if ((statement) == value) {             \
     /* Print the back trace*/             \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError);    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
     return ncclSystemError;     \
   }                             \
 } while (0);
 
-#define EQCHECKGOTO(statement, value, res, label) do { \
+#define EQCHECKGOTO(statement, value, RES, label) do { \
   if ((statement) == value) { \
     /* Print the back trace*/ \
-    res = ncclSystemError;    \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    RES = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
     goto label; \
   } \
 } while (0);
 
 // Propagate errors up
 #define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess && res != ncclInProgress) { \
+  ncclResult_t RES = call; \
+  if (RES != ncclSuccess && RES != ncclInProgress) { \
     /* Print the back trace*/ \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return res; \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
+    return RES; \
   } \
 } while (0);
 
-#define NCCLCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != ncclSuccess && res != ncclInProgress) { \
+#define NCCLCHECKGOTO(call, RES, label) do { \
+  RES = call; \
+  if (RES != ncclSuccess && RES != ncclInProgress) { \
     /* Print the back trace*/ \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
     goto label; \
   } \
 } while (0);
 
 #define NCCLWAIT(call, cond, abortFlagPtr) do {         \
   volatile uint32_t* tmpAbortFlag = (abortFlagPtr);     \
-  ncclResult_t res = call;                \
-  if (res != ncclSuccess && res != ncclInProgress) {               \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+  ncclResult_t RES = call;                \
+  if (RES != ncclSuccess && RES != ncclInProgress) {               \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
     return ncclInternalError;             \
   }                                       \
   if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
 } while (!(cond));
 
-#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
+#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
   volatile uint32_t* tmpAbortFlag = (abortFlagPtr);             \
-  res = call;                             \
-  if (res != ncclSuccess && res != ncclInProgress) {               \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+  RES = call;                             \
+  if (RES != ncclSuccess && RES != ncclInProgress) {               \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
     goto label;                           \
   }                                       \
-  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
+  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
 } while (!(cond));
 
 #define NCCLCHECKTHREAD(a, args) do { \
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 0fb2badb66..bda4be7f71 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -63,11 +63,12 @@ struct ncclDevRedOpFull {
   MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128,  devredop, type))
 
 #define DECL3(func, devredop, type, undef) \
-  DECL4(func, RING,    devredop, type, undef) \
-  DECL4(func, TREE,    devredop, type, undef) \
+  DECL4(func, RING,           devredop, type, undef) \
+  DECL4(func, TREE,           devredop, type, undef) \
   DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
-  DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
-  DECL4(func, NVLS,    devredop, type, undef)
+  DECL4(func, COLLNET_CHAIN,  devredop, type, undef) \
+  DECL4(func, NVLS,           devredop, type, undef) \
+  DECL4(func, NVLS_TREE,      devredop, type, undef)
 
 #if defined(RCCL_BFLOAT16)
 #define DECL2(func, devredop, undefForFloat) \
diff --git a/src/include/comm.h b/src/include/comm.h
index dac5cc8f53..2062de3fca 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -101,19 +101,51 @@ struct ncclCommCallback {
   ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
 };
 
+struct ncclSharedResources {
+  int refCount;
+  struct ncclComm* owner; /* comm which creates this shared res. */
+  struct ncclChannelPeer* peers[MAXCHANNELS];
+  struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
+  /* P2P operation counter, one per channel */
+  uint64_t p2pOpCount[MAXCHANNELS];
+  /* Collective operation counter */
+  uint64_t collOpCount;
+  int tpNRanks;
+  int tpNLocalRanks;
+  int tpNChannels;
+  int tpP2pNChannels;
+  int tpP2pChunkSize;
+  uint64_t magic;
+
+  // top parent rank to localRank translation table
+  int* tpRankToLocalRank;
+  // Internal streams
+  struct ncclStrongStream deviceStream, hostStream;
+
+  /* proxy related shared res */
+  struct ncclProxyState* proxyState;
+};
+
 struct ncclChannel {
-  struct ncclChannelPeer* peers;
-  struct ncclDevChannelPeer* devPeers;
+  struct ncclChannelPeer** peers;
+  struct ncclDevChannelPeer** devPeers;
   struct ncclRing ring;
   int* devRingUserRanks;
   struct ncclTree tree;
+
   struct ncclTree collnetChain;
   struct ncclDirect collnetDirect;
   struct ncclTree binTree;
   struct ncclNvls nvls;
+
   int id; // index of this channel
   uint32_t workFifoSent; // last used work index+1
-  uint64_t p2pOpCount;
+
+  /* comm split sharable resources */
+  struct ncclChannelPeer* collnetPeers;
+  struct ncclDevChannelPeer* collnetDevPeers;
+  struct ncclChannelPeer* nvlsPeers;
+  struct ncclDevChannelPeer* nvlsDevPeers;
 };
 
 struct ncclWorkList {
@@ -167,6 +199,10 @@ struct ncclComm {
   // List of destructors to run when comm is destructed
   struct ncclDestructor* destructorHead;
 
+  struct ncclSharedResources* sharedRes;
+  /* map to top parent ranks. */
+  int* topParentRanks;
+  int* topParentLocalRanks;
   struct ncclChannel channels[MAXCHANNELS];
   struct ncclPeerInfo* peerInfo;
   struct ncclTopoSystem* topo;
@@ -180,15 +216,16 @@ struct ncclComm {
 
   uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
 
+  uint64_t commHash;
   int rank;    // my rank in the communicator
   int nRanks;  // number of GPUs in communicator
   int cudaDev; // my cuda device index
+  //int nvmlDev; // my nvml device index
   int compCap; // compute capability of the GPU
-  int minCompCap; // min compute capability in the communicator
+  int minCompCap, maxCompCap; // min/max compute capability in the communicator
   int64_t busId;   // my PCI bus ID in int format
   cpu_set_t cpuAffinity; // CPU affinity of the GPU
   int WarpSize;
-  int virtualId;
   int cudaArch; // matches __CUDA_ARCH__ of device
 
   int node;
@@ -207,12 +244,11 @@ struct ncclComm {
 
   // Counter for tracking CUDA launches (P2P and collectives included)
   uint64_t opCount;
-  // Collective operation counter
-  uint64_t collOpCount;
 
   // Channels for collectives
   int nChannels;
   int nvlsChannels;
+  int collNetChannels;
   // Channels (per peer) for p2p
   int p2pnChannels;
   int p2pnChannelsPerPeer;
@@ -237,6 +273,8 @@ struct ncclComm {
 
   // Flag to ask NCCL kernels to abort
   volatile uint32_t *abortFlag;
+  volatile uint32_t *childAbortFlag;
+  uint32_t *abortFlagRefCount;
 
   // Flags for enable P2P NET
   uint32_t p2pNet;
@@ -268,21 +306,24 @@ struct ncclComm {
   char intraPad2[64 - sizeof(uint64_t)];
   uint64_t intraBarrierGate; // only used if this is intraComm0
 
-  struct ncclProxyState proxyState;
-
+  struct ncclProxyState* proxyState;
+  int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
   // Whether this communicator uses collNet
   int collNetSupport;
+  uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
   int intraHighestTransportType;
+  int* collNetHeads;
+  int collNetHeadsNum;
+  /* sharable collNet proxy progress resource. */
+  struct ncclCollNetSharedRes* collNetSharedRes;
 
   // NVLink SHARP (NVLS) support
   int nvlsSupport;
-  void* nvlsResources;
+  /* sharable NVLS resource. */
+  struct ncclNvlsSharedRes* nvlsResources;
 
   size_t channelSize; // User requested work size (bytes) for channel partitions
 
-  // Internal streams
-  struct ncclStrongStream deviceStream, hostStream;
-
   // pools backed by comm->memPermanent
   struct ncclMemoryPool memPool_ncclProxyOp;
   struct ncclMemoryPool memPool_ncclKernelPlan;
@@ -319,13 +360,7 @@ struct ncclComm {
   volatile bool collTraceExit;
 #endif
 
-  // communicator mode
-  int blocking;
-  // CGA cluster size
-  int cgaClusterSize;
-  int minCTAs, maxCTAs;
-  // network interface name
-  char *netName;
+  ncclConfig_t config;
   // initState is to more conveniently reclaim resources when errors happen.
   ncclResult_t initState;
   // flag to indicate if ncclCommFinalize() is called
diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h
index 317ca2df6d..da9ce45a4f 100644
--- a/src/include/cudawrap.h
+++ b/src/include/cudawrap.h
@@ -11,6 +11,9 @@
 #include <cuda_runtime.h>
 #include "checks.h"
 
+// Is cuMem API usage enabled
+extern int ncclCuMemEnable();
+
 #if CUDART_VERSION >= 11030
 #include <cudaTypedefs.h>
 #else
@@ -85,6 +88,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
 DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
 #if CUDA_VERSION >= 11070
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 1c80a3f32c..49a32f148c 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -21,12 +21,13 @@
 typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
 
-#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
+#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*/NVLS*
 #define NCCL_ALGO_TREE 0
 #define NCCL_ALGO_RING 1
 #define NCCL_ALGO_COLLNET_DIRECT 2
 #define NCCL_ALGO_COLLNET_CHAIN 3
 #define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
 
 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -112,10 +113,10 @@ struct ncclConnInfo {
 };
 
 struct ncclProxyConnector {
-  int rank;
-  int localRank;
+  int tpRank;
+  int tpLocalRank;
+  int sameProcess;
   struct ncclProxyConnection* connection;
-  struct ncclComm* comm;
 };
 
 struct ncclConnector {
@@ -124,7 +125,6 @@ struct ncclConnector {
   struct ncclTransportComm* transportComm;
   void* transportResources;
   struct ncclConnInfo conn;
-  struct ncclComm *comm;
 };
 
 struct ncclRing {
@@ -141,6 +141,9 @@ struct ncclRing {
 };
 
 
+// The root of each tree only has one node down (+1 intra-node).
+#define NCCL_MAX_TREE_ARITY_TOP 2
+// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
 #define NCCL_MAX_TREE_ARITY 3
 struct ncclTree {
   int depth;
@@ -161,18 +164,24 @@ struct ncclDirect {
 
 #define NCCL_CONN_IDX_P2P_NET 2
 #define NCCL_MAX_NVLS_ARITY 8
+#define NCCL_MAX_NVLS_TREE_ARITY 3
 struct ncclNvls {
   int out;
   int nHeads;   // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
   int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
   int up[NCCL_MAX_NVLS_ARITY];
   int down;
+  int treeUp;
+  int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
+  int node;
+  int nNodes;
 };
 
 #define NCCL_MAX_CONNS 3
 struct ncclChannelPeer {
   struct ncclConnector send[NCCL_MAX_CONNS];
   struct ncclConnector recv[NCCL_MAX_CONNS];
+  int refCount;
 };
 
 struct ncclDevComm;
@@ -362,7 +371,7 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
 #endif
 
 struct alignas(16) ncclDevChannel {
-  struct ncclDevChannelPeer *peers;
+  struct ncclDevChannelPeer** peers;
   struct ncclRing ring;
   struct ncclTree tree;
   struct ncclTree collnetChain;
diff --git a/src/include/gdrwrap.h b/src/include/gdrwrap.h
index c83a2292ae..f532a705e1 100644
--- a/src/include/gdrwrap.h
+++ b/src/include/gdrwrap.h
@@ -298,7 +298,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
   gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
   NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
   NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
-  CUDACHECK(cudaFree(md->gdrDevMem));
+  NCCLCHECK(ncclCudaFree(md->gdrDevMem));
   free(md);
 
   return ncclSuccess;
diff --git a/src/include/graph.h b/src/include/graph.h
index 38b17d5113..69726e08de 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -59,9 +59,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
 #define NCCL_TOPO_CPU_TYPE_ROME 4
 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
-ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
+ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
+ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
 
 #define NCCL_TOPO_MAX_NODES 256
 
@@ -72,6 +74,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
 #define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
 #define NCCL_TOPO_PATTERN_TREE 3            // All NIC traffic going to/from the same GPU
 #define NCCL_TOPO_PATTERN_RING 4            // Ring
+#define NCCL_TOPO_PATTERN_NVLS 5            // NVLS+SHARP and NVLS+Tree
 struct ncclTopoGraph {
   // Input / output
   int id; // ring : 0, tree : 1, collnet : 2
@@ -108,18 +111,16 @@ struct ncclTopoRanks {
   int treeToParent[MAXCHANNELS];
   int treeToChild0[MAXCHANNELS];
   int treeToChild1[MAXCHANNELS];
+  int nvlsHeads[MAXCHANNELS];
 };
 
-ncclResult_t ncclTopoPreset(struct ncclComm* comm,
-    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
-    struct ncclTopoRanks* topoRanks);
+ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
 
 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
-    struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
-
+    struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc);
 ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
 
-ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
+ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
 #include "info.h"
 ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
 
diff --git a/src/include/group.h b/src/include/group.h
index dd0e642d22..a92c348863 100644
--- a/src/include/group.h
+++ b/src/include/group.h
@@ -36,6 +36,7 @@ struct ncclAsyncJob {
   void(*destructor)(void*);
   ncclGroupJobState_t state;
   volatile uint32_t *abortFlag; /* point to comm abortFlag */
+  volatile uint32_t *childAbortFlag; /* point to child abortFlag */
   ncclComm_t comm;
 };
 
@@ -67,6 +68,24 @@ extern __thread ncclResult_t ncclGroupError;
 extern __thread struct ncclComm* ncclGroupCommHead;
 extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
 extern __thread int ncclGroupBlocking;
+extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
+extern __thread struct ncclGroupJob ncclGroupJobMain;
+
+static inline void groupResetJobState() {
+  ncclGroupBlocking = -1;
+  ncclGroupJobMainPtr = NULL;
+  memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
+  return;
+}
+
+static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
+  ncclResult_t ret = ncclSuccess;
+  if (job) {
+    ret = ncclAsyncJobComplete(&job->base);
+    groupResetJobState();
+  }
+  return ret;
+}
 
 inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
   if (ncclGroupDepth > 0) {
@@ -91,7 +110,7 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
     ncclMemoryStackPush(&comm->memScoped);
   }
 
-  ncclGroupBlocking = comm->blocking;
+  ncclGroupBlocking = comm->config.blocking;
 }
 
 // Add comm to this thread's group needing preconnect
diff --git a/src/include/ibvcore.h b/src/include/ibvcore.h
new file mode 100644
index 0000000000..00a6b6f60b
--- /dev/null
+++ b/src/include/ibvcore.h
@@ -0,0 +1,1043 @@
+#ifndef NCCL_IBV_CORE_H_
+#define NCCL_IBV_CORE_H_
+
+/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without
+ * explicitly including the IB verbs header.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#if __GNUC__ >= 3
+#  define __attribute_const __attribute__((const))
+#else
+#  define __attribute_const
+#endif
+
+union ibv_gid {
+	uint8_t			raw[16];
+	struct {
+		uint64_t	subnet_prefix;
+		uint64_t	interface_id;
+	} global;
+};
+
+#ifndef container_of
+/**
+  * container_of - cast a member of a structure out to the containing structure
+  * @ptr:        the pointer to the member.
+  * @type:       the type of the container struct this is embedded in.
+  * @member:     the name of the member within the struct.
+  *
+ */
+#define container_of(ptr, type, member) \
+	((type *) ((uint8_t *)(ptr) - offsetof(type, member)))
+#endif
+
+#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz))
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1;
+
+enum ibv_node_type {
+	IBV_NODE_UNKNOWN	= -1,
+	IBV_NODE_CA 		= 1,
+	IBV_NODE_SWITCH,
+	IBV_NODE_ROUTER,
+	IBV_NODE_RNIC,
+
+	/* Leave a gap for future node types before starting with
+	 * experimental node types.
+	 */
+	IBV_EXP_NODE_TYPE_START	= 32,
+	IBV_EXP_NODE_MIC	= IBV_EXP_NODE_TYPE_START
+};
+
+enum ibv_transport_type {
+	IBV_TRANSPORT_UNKNOWN	= -1,
+	IBV_TRANSPORT_IB	= 0,
+	IBV_TRANSPORT_IWARP,
+
+	/* Leave a gap for future transport types before starting with
+	 * experimental transport types.
+	 */
+	IBV_EXP_TRANSPORT_TYPE_START	= 32,
+	IBV_EXP_TRANSPORT_SCIF		= IBV_EXP_TRANSPORT_TYPE_START
+};
+
+enum ibv_device_cap_flags {
+	IBV_DEVICE_RESIZE_MAX_WR	= 1,
+	IBV_DEVICE_BAD_PKEY_CNTR	= 1 <<  1,
+	IBV_DEVICE_BAD_QKEY_CNTR	= 1 <<  2,
+	IBV_DEVICE_RAW_MULTI		= 1 <<  3,
+	IBV_DEVICE_AUTO_PATH_MIG	= 1 <<  4,
+	IBV_DEVICE_CHANGE_PHY_PORT	= 1 <<  5,
+	IBV_DEVICE_UD_AV_PORT_ENFORCE	= 1 <<  6,
+	IBV_DEVICE_CURR_QP_STATE_MOD	= 1 <<  7,
+	IBV_DEVICE_SHUTDOWN_PORT	= 1 <<  8,
+	IBV_DEVICE_INIT_TYPE		= 1 <<  9,
+	IBV_DEVICE_PORT_ACTIVE_EVENT	= 1 << 10,
+	IBV_DEVICE_SYS_IMAGE_GUID	= 1 << 11,
+	IBV_DEVICE_RC_RNR_NAK_GEN	= 1 << 12,
+	IBV_DEVICE_SRQ_RESIZE		= 1 << 13,
+	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14,
+	IBV_DEVICE_XRC			= 1 << 20,
+	IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
+};
+
+enum ibv_atomic_cap {
+	IBV_ATOMIC_NONE,
+	IBV_ATOMIC_HCA,
+	IBV_ATOMIC_GLOB
+};
+
+struct ibv_device_attr {
+	char			fw_ver[64];
+	uint64_t		node_guid;
+	uint64_t		sys_image_guid;
+	uint64_t		max_mr_size;
+	uint64_t		page_size_cap;
+	uint32_t		vendor_id;
+	uint32_t		vendor_part_id;
+	uint32_t		hw_ver;
+	int			max_qp;
+	int			max_qp_wr;
+	int			device_cap_flags;
+	int			max_sge;
+	int			max_sge_rd;
+	int			max_cq;
+	int			max_cqe;
+	int			max_mr;
+	int			max_pd;
+	int			max_qp_rd_atom;
+	int			max_ee_rd_atom;
+	int			max_res_rd_atom;
+	int			max_qp_init_rd_atom;
+	int			max_ee_init_rd_atom;
+	enum ibv_atomic_cap	atomic_cap;
+	int			max_ee;
+	int			max_rdd;
+	int			max_mw;
+	int			max_raw_ipv6_qp;
+	int			max_raw_ethy_qp;
+	int			max_mcast_grp;
+	int			max_mcast_qp_attach;
+	int			max_total_mcast_qp_attach;
+	int			max_ah;
+	int			max_fmr;
+	int			max_map_per_fmr;
+	int			max_srq;
+	int			max_srq_wr;
+	int			max_srq_sge;
+	uint16_t		max_pkeys;
+	uint8_t			local_ca_ack_delay;
+	uint8_t			phys_port_cnt;
+};
+
+enum ibv_mtu {
+	IBV_MTU_256  = 1,
+	IBV_MTU_512  = 2,
+	IBV_MTU_1024 = 3,
+	IBV_MTU_2048 = 4,
+	IBV_MTU_4096 = 5
+};
+
+enum ibv_port_state {
+	IBV_PORT_NOP		= 0,
+	IBV_PORT_DOWN		= 1,
+	IBV_PORT_INIT		= 2,
+	IBV_PORT_ARMED		= 3,
+	IBV_PORT_ACTIVE		= 4,
+	IBV_PORT_ACTIVE_DEFER	= 5
+};
+
+enum {
+	IBV_LINK_LAYER_UNSPECIFIED,
+	IBV_LINK_LAYER_INFINIBAND,
+	IBV_LINK_LAYER_ETHERNET,
+
+	/* Leave a gap for future link layer types before starting with
+	 * experimental link layer.
+	 */
+	IBV_EXP_LINK_LAYER_START	= 32,
+	IBV_EXP_LINK_LAYER_SCIF		= IBV_EXP_LINK_LAYER_START
+};
+
+enum ibv_port_cap_flags {
+	IBV_PORT_SM				= 1 <<  1,
+	IBV_PORT_NOTICE_SUP			= 1 <<  2,
+	IBV_PORT_TRAP_SUP			= 1 <<  3,
+	IBV_PORT_OPT_IPD_SUP			= 1 <<  4,
+	IBV_PORT_AUTO_MIGR_SUP			= 1 <<  5,
+	IBV_PORT_SL_MAP_SUP			= 1 <<  6,
+	IBV_PORT_MKEY_NVRAM			= 1 <<  7,
+	IBV_PORT_PKEY_NVRAM			= 1 <<  8,
+	IBV_PORT_LED_INFO_SUP			= 1 <<  9,
+	IBV_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
+	IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
+	IBV_PORT_EXTENDED_SPEEDS_SUP		= 1 << 14,
+	IBV_PORT_CM_SUP				= 1 << 16,
+	IBV_PORT_SNMP_TUNNEL_SUP		= 1 << 17,
+	IBV_PORT_REINIT_SUP			= 1 << 18,
+	IBV_PORT_DEVICE_MGMT_SUP		= 1 << 19,
+	IBV_PORT_VENDOR_CLASS			= 1 << 24,
+	IBV_PORT_CLIENT_REG_SUP			= 1 << 25,
+	IBV_PORT_IP_BASED_GIDS			= 1 << 26,
+};
+
+struct ibv_port_attr {
+	enum ibv_port_state	state;
+	enum ibv_mtu		max_mtu;
+	enum ibv_mtu		active_mtu;
+	int			gid_tbl_len;
+	uint32_t		port_cap_flags;
+	uint32_t		max_msg_sz;
+	uint32_t		bad_pkey_cntr;
+	uint32_t		qkey_viol_cntr;
+	uint16_t		pkey_tbl_len;
+	uint16_t		lid;
+	uint16_t		sm_lid;
+	uint8_t			lmc;
+	uint8_t			max_vl_num;
+	uint8_t			sm_sl;
+	uint8_t			subnet_timeout;
+	uint8_t			init_type_reply;
+	uint8_t			active_width;
+	uint8_t			active_speed;
+	uint8_t			phys_state;
+	uint8_t			link_layer;
+	uint8_t			reserved;
+};
+
+enum ibv_event_type {
+	IBV_EVENT_CQ_ERR,
+	IBV_EVENT_QP_FATAL,
+	IBV_EVENT_QP_REQ_ERR,
+	IBV_EVENT_QP_ACCESS_ERR,
+	IBV_EVENT_COMM_EST,
+	IBV_EVENT_SQ_DRAINED,
+	IBV_EVENT_PATH_MIG,
+	IBV_EVENT_PATH_MIG_ERR,
+	IBV_EVENT_DEVICE_FATAL,
+	IBV_EVENT_PORT_ACTIVE,
+	IBV_EVENT_PORT_ERR,
+	IBV_EVENT_LID_CHANGE,
+	IBV_EVENT_PKEY_CHANGE,
+	IBV_EVENT_SM_CHANGE,
+	IBV_EVENT_SRQ_ERR,
+	IBV_EVENT_SRQ_LIMIT_REACHED,
+	IBV_EVENT_QP_LAST_WQE_REACHED,
+	IBV_EVENT_CLIENT_REREGISTER,
+	IBV_EVENT_GID_CHANGE,
+
+	/* new experimental events start here leaving enough
+	 * room for 14 events which should be enough
+	 */
+	IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32,
+	IBV_EXP_EVENT_DCT_ACCESS_ERR,
+	IBV_EXP_EVENT_DCT_REQ_ERR,
+};
+
+struct ibv_async_event {
+	union {
+		struct ibv_cq  *cq;
+		struct ibv_qp  *qp;
+		struct ibv_srq *srq;
+		struct ibv_exp_dct *dct;
+		int		port_num;
+		/* For source compatibility with the legacy API */
+		uint32_t	xrc_qp_num;
+	} element;
+	enum ibv_event_type	event_type;
+};
+
+enum ibv_wc_status {
+	IBV_WC_SUCCESS,
+	IBV_WC_LOC_LEN_ERR,
+	IBV_WC_LOC_QP_OP_ERR,
+	IBV_WC_LOC_EEC_OP_ERR,
+	IBV_WC_LOC_PROT_ERR,
+	IBV_WC_WR_FLUSH_ERR,
+	IBV_WC_MW_BIND_ERR,
+	IBV_WC_BAD_RESP_ERR,
+	IBV_WC_LOC_ACCESS_ERR,
+	IBV_WC_REM_INV_REQ_ERR,
+	IBV_WC_REM_ACCESS_ERR,
+	IBV_WC_REM_OP_ERR,
+	IBV_WC_RETRY_EXC_ERR,
+	IBV_WC_RNR_RETRY_EXC_ERR,
+	IBV_WC_LOC_RDD_VIOL_ERR,
+	IBV_WC_REM_INV_RD_REQ_ERR,
+	IBV_WC_REM_ABORT_ERR,
+	IBV_WC_INV_EECN_ERR,
+	IBV_WC_INV_EEC_STATE_ERR,
+	IBV_WC_FATAL_ERR,
+	IBV_WC_RESP_TIMEOUT_ERR,
+	IBV_WC_GENERAL_ERR
+};
+const char *ibv_wc_status_str(enum ibv_wc_status status);
+
+enum ibv_wc_opcode {
+	IBV_WC_SEND,
+	IBV_WC_RDMA_WRITE,
+	IBV_WC_RDMA_READ,
+	IBV_WC_COMP_SWAP,
+	IBV_WC_FETCH_ADD,
+	IBV_WC_BIND_MW,
+/*
+ * Set value of IBV_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IBV_WC_RECV).
+ */
+	IBV_WC_RECV			= 1 << 7,
+	IBV_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ibv_wc_flags {
+	IBV_WC_GRH		= 1 << 0,
+	IBV_WC_WITH_IMM		= 1 << 1
+};
+
+struct ibv_wc {
+	uint64_t		wr_id;
+	enum ibv_wc_status	status;
+	enum ibv_wc_opcode	opcode;
+	uint32_t		vendor_err;
+	uint32_t		byte_len;
+	uint32_t		imm_data;	/* in network byte order */
+	uint32_t		qp_num;
+	uint32_t		src_qp;
+	int			wc_flags;
+	uint16_t		pkey_index;
+	uint16_t		slid;
+	uint8_t			sl;
+	uint8_t			dlid_path_bits;
+};
+
+enum ibv_access_flags {
+	IBV_ACCESS_LOCAL_WRITE		= 1,
+	IBV_ACCESS_REMOTE_WRITE		= (1<<1),
+	IBV_ACCESS_REMOTE_READ		= (1<<2),
+	IBV_ACCESS_REMOTE_ATOMIC	= (1<<3),
+	IBV_ACCESS_MW_BIND		= (1<<4),
+	IBV_ACCESS_RELAXED_ORDERING     = (1<<20),
+};
+
+struct ibv_pd {
+	struct ibv_context     *context;
+	uint32_t		handle;
+};
+
+enum ibv_xrcd_init_attr_mask {
+	IBV_XRCD_INIT_ATTR_FD	    = 1 << 0,
+	IBV_XRCD_INIT_ATTR_OFLAGS   = 1 << 1,
+	IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2
+};
+
+struct ibv_xrcd_init_attr {
+	uint32_t comp_mask;
+	int	 fd;
+	int	 oflags;
+};
+
+struct ibv_xrcd {
+	struct ibv_context     *context;
+};
+
+enum ibv_rereg_mr_flags {
+	IBV_REREG_MR_CHANGE_TRANSLATION	= (1 << 0),
+	IBV_REREG_MR_CHANGE_PD		= (1 << 1),
+	IBV_REREG_MR_CHANGE_ACCESS	= (1 << 2),
+	IBV_REREG_MR_KEEP_VALID		= (1 << 3)
+};
+
+struct ibv_mr {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	void		       *addr;
+	size_t			length;
+	uint32_t		handle;
+	uint32_t		lkey;
+	uint32_t		rkey;
+};
+
+enum ibv_mw_type {
+	IBV_MW_TYPE_1			= 1,
+	IBV_MW_TYPE_2			= 2
+};
+
+struct ibv_mw {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	uint32_t		rkey;
+};
+
+struct ibv_global_route {
+	union ibv_gid		dgid;
+	uint32_t		flow_label;
+	uint8_t			sgid_index;
+	uint8_t			hop_limit;
+	uint8_t			traffic_class;
+};
+
+struct ibv_grh {
+	uint32_t		version_tclass_flow;
+	uint16_t		paylen;
+	uint8_t			next_hdr;
+	uint8_t			hop_limit;
+	union ibv_gid		sgid;
+	union ibv_gid		dgid;
+};
+
+enum ibv_rate {
+	IBV_RATE_MAX      = 0,
+	IBV_RATE_2_5_GBPS = 2,
+	IBV_RATE_5_GBPS   = 5,
+	IBV_RATE_10_GBPS  = 3,
+	IBV_RATE_20_GBPS  = 6,
+	IBV_RATE_30_GBPS  = 4,
+	IBV_RATE_40_GBPS  = 7,
+	IBV_RATE_60_GBPS  = 8,
+	IBV_RATE_80_GBPS  = 9,
+	IBV_RATE_120_GBPS = 10,
+	IBV_RATE_14_GBPS  = 11,
+	IBV_RATE_56_GBPS  = 12,
+	IBV_RATE_112_GBPS = 13,
+	IBV_RATE_168_GBPS = 14,
+	IBV_RATE_25_GBPS  = 15,
+	IBV_RATE_100_GBPS = 16,
+	IBV_RATE_200_GBPS = 17,
+	IBV_RATE_300_GBPS = 18
+};
+
+/**
+ * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec.  For example, IBV_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum.
+ * @mult: multiple to convert.
+ */
+enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const;
+
+/**
+ * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec.
+ * For example, IBV_RATE_5_GBPS will return the value 5000.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum.
+ * @mbps: value to convert.
+ */
+enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const;
+
+struct ibv_ah_attr {
+	struct ibv_global_route	grh;
+	uint16_t		dlid;
+	uint8_t			sl;
+	uint8_t			src_path_bits;
+	uint8_t			static_rate;
+	uint8_t			is_global;
+	uint8_t			port_num;
+};
+
+enum ibv_srq_attr_mask {
+	IBV_SRQ_MAX_WR	= 1 << 0,
+	IBV_SRQ_LIMIT	= 1 << 1
+};
+
+struct ibv_srq_attr {
+	uint32_t		max_wr;
+	uint32_t		max_sge;
+	uint32_t		srq_limit;
+};
+
+struct ibv_srq_init_attr {
+	void		       *srq_context;
+	struct ibv_srq_attr	attr;
+};
+
+enum ibv_srq_type {
+	IBV_SRQT_BASIC,
+	IBV_SRQT_XRC
+};
+
+enum ibv_srq_init_attr_mask {
+	IBV_SRQ_INIT_ATTR_TYPE		= 1 << 0,
+	IBV_SRQ_INIT_ATTR_PD		= 1 << 1,
+	IBV_SRQ_INIT_ATTR_XRCD		= 1 << 2,
+	IBV_SRQ_INIT_ATTR_CQ		= 1 << 3,
+	IBV_SRQ_INIT_ATTR_RESERVED	= 1 << 4
+};
+
+struct ibv_srq_init_attr_ex {
+	void		       *srq_context;
+	struct ibv_srq_attr	attr;
+
+	uint32_t		comp_mask;
+	enum ibv_srq_type	srq_type;
+	struct ibv_pd	       *pd;
+	struct ibv_xrcd	       *xrcd;
+	struct ibv_cq	       *cq;
+};
+
+enum ibv_qp_type {
+	IBV_QPT_RC = 2,
+	IBV_QPT_UC,	/* = 3 (implicit) */
+	IBV_QPT_UD,	/* = 4 (implicit) */
+	/* XRC compatible code */
+	IBV_QPT_XRC,	/* = 5 (implicit) */
+	IBV_QPT_RAW_PACKET = 8,
+	IBV_QPT_RAW_ETH = 8,	/* same value as IBV_QPT_RAW_PACKET */
+	IBV_QPT_XRC_SEND = 9,
+	IBV_QPT_XRC_RECV,	/* = 10 (implicit) */
+
+	/* Leave a gap for future qp types before starting with
+	 * experimental qp types.
+	 */
+	IBV_EXP_QP_TYPE_START	= 32,
+	IBV_EXP_QPT_DC_INI	= IBV_EXP_QP_TYPE_START
+};
+
+struct ibv_qp_cap {
+	uint32_t		max_send_wr;
+	uint32_t		max_recv_wr;
+	uint32_t		max_send_sge;
+	uint32_t		max_recv_sge;
+	uint32_t		max_inline_data;
+};
+
+struct ibv_qp_init_attr {
+	void		       *qp_context;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	struct ibv_qp_cap	cap;
+	enum ibv_qp_type	qp_type;
+	int			sq_sig_all;
+	/* Below is needed for backwards compatibility */
+	struct ibv_xrc_domain  *xrc_domain;
+};
+
+enum ibv_qp_init_attr_mask {
+	IBV_QP_INIT_ATTR_PD		= 1 << 0,
+	IBV_QP_INIT_ATTR_XRCD		= 1 << 1,
+	IBV_QP_INIT_ATTR_RESERVED	= 1 << 2
+};
+
+struct ibv_qp_init_attr_ex {
+	void		       *qp_context;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	struct ibv_qp_cap	cap;
+	enum ibv_qp_type	qp_type;
+	int			sq_sig_all;
+
+	uint32_t		comp_mask;
+	struct ibv_pd	       *pd;
+	struct ibv_xrcd	       *xrcd;
+};
+
+enum ibv_qp_open_attr_mask {
+	IBV_QP_OPEN_ATTR_NUM		= 1 << 0,
+	IBV_QP_OPEN_ATTR_XRCD		= 1 << 1,
+	IBV_QP_OPEN_ATTR_CONTEXT	= 1 << 2,
+	IBV_QP_OPEN_ATTR_TYPE		= 1 << 3,
+	IBV_QP_OPEN_ATTR_RESERVED	= 1 << 4
+};
+
+struct ibv_qp_open_attr {
+	uint32_t		comp_mask;
+	uint32_t		qp_num;
+	struct ibv_xrcd        *xrcd;
+	void		       *qp_context;
+	enum ibv_qp_type	qp_type;
+};
+
+enum ibv_qp_attr_mask {
+	IBV_QP_STATE			= 1 << 	0,
+	IBV_QP_CUR_STATE		= 1 << 	1,
+	IBV_QP_EN_SQD_ASYNC_NOTIFY	= 1 << 	2,
+	IBV_QP_ACCESS_FLAGS		= 1 << 	3,
+	IBV_QP_PKEY_INDEX		= 1 << 	4,
+	IBV_QP_PORT			= 1 << 	5,
+	IBV_QP_QKEY			= 1 << 	6,
+	IBV_QP_AV			= 1 << 	7,
+	IBV_QP_PATH_MTU			= 1 << 	8,
+	IBV_QP_TIMEOUT			= 1 << 	9,
+	IBV_QP_RETRY_CNT		= 1 << 10,
+	IBV_QP_RNR_RETRY		= 1 << 11,
+	IBV_QP_RQ_PSN			= 1 << 12,
+	IBV_QP_MAX_QP_RD_ATOMIC		= 1 << 13,
+	IBV_QP_ALT_PATH			= 1 << 14,
+	IBV_QP_MIN_RNR_TIMER		= 1 << 15,
+	IBV_QP_SQ_PSN			= 1 << 16,
+	IBV_QP_MAX_DEST_RD_ATOMIC	= 1 << 17,
+	IBV_QP_PATH_MIG_STATE		= 1 << 18,
+	IBV_QP_CAP			= 1 << 19,
+	IBV_QP_DEST_QPN			= 1 << 20
+};
+
+enum ibv_qp_state {
+	IBV_QPS_RESET,
+	IBV_QPS_INIT,
+	IBV_QPS_RTR,
+	IBV_QPS_RTS,
+	IBV_QPS_SQD,
+	IBV_QPS_SQE,
+	IBV_QPS_ERR,
+	IBV_QPS_UNKNOWN
+};
+
+enum ibv_mig_state {
+	IBV_MIG_MIGRATED,
+	IBV_MIG_REARM,
+	IBV_MIG_ARMED
+};
+
+struct ibv_qp_attr {
+	enum ibv_qp_state	qp_state;
+	enum ibv_qp_state	cur_qp_state;
+	enum ibv_mtu		path_mtu;
+	enum ibv_mig_state	path_mig_state;
+	uint32_t		qkey;
+	uint32_t		rq_psn;
+	uint32_t		sq_psn;
+	uint32_t		dest_qp_num;
+	int			qp_access_flags;
+	struct ibv_qp_cap	cap;
+	struct ibv_ah_attr	ah_attr;
+	struct ibv_ah_attr	alt_ah_attr;
+	uint16_t		pkey_index;
+	uint16_t		alt_pkey_index;
+	uint8_t			en_sqd_async_notify;
+	uint8_t			sq_draining;
+	uint8_t			max_rd_atomic;
+	uint8_t			max_dest_rd_atomic;
+	uint8_t			min_rnr_timer;
+	uint8_t			port_num;
+	uint8_t			timeout;
+	uint8_t			retry_cnt;
+	uint8_t			rnr_retry;
+	uint8_t			alt_port_num;
+	uint8_t			alt_timeout;
+};
+
+enum ibv_wr_opcode {
+	IBV_WR_RDMA_WRITE,
+	IBV_WR_RDMA_WRITE_WITH_IMM,
+	IBV_WR_SEND,
+	IBV_WR_SEND_WITH_IMM,
+	IBV_WR_RDMA_READ,
+	IBV_WR_ATOMIC_CMP_AND_SWP,
+	IBV_WR_ATOMIC_FETCH_AND_ADD
+};
+
+enum ibv_send_flags {
+	IBV_SEND_FENCE		= 1 << 0,
+	IBV_SEND_SIGNALED	= 1 << 1,
+	IBV_SEND_SOLICITED	= 1 << 2,
+	IBV_SEND_INLINE		= 1 << 3
+};
+
+struct ibv_sge {
+	uint64_t		addr;
+	uint32_t		length;
+	uint32_t		lkey;
+};
+
+struct ibv_send_wr {
+	uint64_t		wr_id;
+	struct ibv_send_wr     *next;	/* self-referential link to another ibv_send_wr */
+	struct ibv_sge	       *sg_list;
+	int			num_sge;
+	enum ibv_wr_opcode	opcode;
+	int			send_flags;
+	uint32_t		imm_data;	/* in network byte order */
+	union {	/* named `wr`: rdma, atomic and ud share the same storage */
+		struct {
+			uint64_t	remote_addr;
+			uint32_t	rkey;
+		} rdma;
+		struct {
+			uint64_t	remote_addr;
+			uint64_t	compare_add;
+			uint64_t	swap;
+			uint32_t	rkey;
+		} atomic;
+		struct {
+			struct ibv_ah  *ah;
+			uint32_t	remote_qpn;
+			uint32_t	remote_qkey;
+		} ud;
+	} wr;
+	union {	/* anonymous union: qp_type.xrc.remote_srqn and xrc_remote_srq_num overlay the same 32 bits */
+		union {
+			struct {
+				uint32_t    remote_srqn;
+			} xrc;
+		} qp_type;
+
+		uint32_t		xrc_remote_srq_num;
+	};
+};
+
+struct ibv_recv_wr {
+	uint64_t		wr_id;
+	struct ibv_recv_wr     *next;
+	struct ibv_sge	       *sg_list;
+	int			num_sge;
+};
+
+struct ibv_mw_bind {
+	uint64_t		wr_id;
+	struct ibv_mr	       *mr;
+	void		       *addr;
+	size_t			length;
+	int			send_flags;
+	int			mw_access_flags;
+};
+
+struct ibv_srq {
+	struct ibv_context     *context;
+	void		       *srq_context;
+	struct ibv_pd	       *pd;
+	uint32_t		handle;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		events_completed;
+
+	/* below are for source compatibility with legacy XRC,
+	*   padding based on ibv_srq_legacy.
+	*/
+	uint32_t		xrc_srq_num_bin_compat_padding;
+	struct ibv_xrc_domain	*xrc_domain_bin_compat_padding;
+	struct ibv_cq	*xrc_cq_bin_compat_padding;
+	void		*ibv_srq_padding;
+
+	/* legacy fields */
+	uint32_t		xrc_srq_num;
+	struct ibv_xrc_domain	*xrc_domain;
+	struct ibv_cq		*xrc_cq;
+};
+
+/* Not in use in new API, needed for compilation as part of source compat layer */
+enum ibv_event_flags {
+	IBV_XRC_QP_EVENT_FLAG = 0x80000000,
+};
+
+
+
+struct ibv_qp {
+	struct ibv_context     *context;
+	void		       *qp_context;
+	struct ibv_pd	       *pd;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	uint32_t		handle;
+	uint32_t		qp_num;
+	enum ibv_qp_state       state;
+	enum ibv_qp_type	qp_type;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		events_completed;
+};
+
+struct ibv_comp_channel {
+	struct ibv_context     *context;
+	int			fd;
+	int			refcnt;
+};
+
+struct ibv_cq {
+	struct ibv_context     *context;
+	struct ibv_comp_channel *channel;
+	void		       *cq_context;
+	uint32_t		handle;
+	int			cqe;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		comp_events_completed;
+	uint32_t		async_events_completed;
+};
+
+struct ibv_ah {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	uint32_t		handle;
+};
+
+enum ibv_flow_flags {
+	IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1,
+	IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1,
+};
+
+enum ibv_flow_attr_type {
+	/* steering according to rule specifications */
+	IBV_FLOW_ATTR_NORMAL		= 0x0,
+	/* default unicast and multicast rule -
+	 * receive all Eth traffic which isn't steered to any QP
+	 */
+	IBV_FLOW_ATTR_ALL_DEFAULT	= 0x1,
+	/* default multicast rule -
+	 * receive all Eth multicast traffic which isn't steered to any QP
+	 */
+	IBV_FLOW_ATTR_MC_DEFAULT	= 0x2,
+};
+
+enum ibv_flow_spec_type {
+	IBV_FLOW_SPEC_ETH	= 0x20,
+	IBV_FLOW_SPEC_IPV4	= 0x30,
+	IBV_FLOW_SPEC_TCP	= 0x40,
+	IBV_FLOW_SPEC_UDP	= 0x41,
+};
+
+struct ibv_flow_eth_filter {
+	uint8_t		dst_mac[6];
+	uint8_t		src_mac[6];
+	uint16_t	ether_type;
+	/*
+	 * same layout as 802.1q: prio 3, cfi 1, vlan id 12
+	 */
+	uint16_t	vlan_tag;
+};
+
+struct ibv_flow_spec_eth {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_eth_filter val;
+	struct ibv_flow_eth_filter mask;
+};
+
+struct ibv_flow_ipv4_filter {
+	uint32_t src_ip;
+	uint32_t dst_ip;
+};
+
+struct ibv_flow_spec_ipv4 {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_ipv4_filter val;
+	struct ibv_flow_ipv4_filter mask;
+};
+
+struct ibv_flow_tcp_udp_filter {
+	uint16_t dst_port;
+	uint16_t src_port;
+};
+
+struct ibv_flow_spec_tcp_udp {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_tcp_udp_filter val;
+	struct ibv_flow_tcp_udp_filter mask;
+};
+
+struct ibv_flow_spec {
+	union {	/* anonymous union: members are accessed directly on struct ibv_flow_spec */
+		struct {
+			enum ibv_flow_spec_type	type;
+			uint16_t		size;
+		} hdr;	/* same leading type/size pair that begins each of eth, ipv4 and tcp_udp */
+		struct ibv_flow_spec_eth eth;
+		struct ibv_flow_spec_ipv4 ipv4;
+		struct ibv_flow_spec_tcp_udp tcp_udp;
+	};
+};
+
+struct ibv_flow_attr {
+	uint32_t comp_mask;
+	enum ibv_flow_attr_type type;
+	uint16_t size;
+	uint16_t priority;
+	uint8_t num_of_specs;
+	uint8_t port;
+	uint32_t flags;
+	/* Following are the optional layers according to user request
+	 * struct ibv_flow_spec_xxx [L2]
+	 * struct ibv_flow_spec_yyy [L3/L4]
+	 */
+};
+
+struct ibv_flow {
+	uint32_t	   comp_mask;
+	struct ibv_context *context;
+	uint32_t	   handle;
+};
+
+struct ibv_device;
+struct ibv_context;
+
+struct ibv_device_ops {
+	struct ibv_context *	(*alloc_context)(struct ibv_device *device, int cmd_fd);
+	void			(*free_context)(struct ibv_context *context);
+};
+
+enum {
+	IBV_SYSFS_NAME_MAX	= 64,
+	IBV_SYSFS_PATH_MAX	= 256
+};
+
+struct ibv_device {
+	struct ibv_device_ops	ops;
+	enum ibv_node_type	node_type;
+	enum ibv_transport_type	transport_type;
+	/* Name of underlying kernel IB device, eg "mthca0" */
+	char			name[IBV_SYSFS_NAME_MAX];
+	/* Name of uverbs device, eg "uverbs0" */
+	char			dev_name[IBV_SYSFS_NAME_MAX];
+	/* Path to infiniband_verbs class device in sysfs */
+	char			dev_path[IBV_SYSFS_PATH_MAX];
+	/* Path to infiniband class device in sysfs */
+	char			ibdev_path[IBV_SYSFS_PATH_MAX];
+};
+
+struct verbs_device {
+	struct ibv_device device; /* Must be first */
+	size_t	sz;
+	size_t	size_of_context;
+	int	(*init_context)(struct verbs_device *device,
+				struct ibv_context *ctx, int cmd_fd);
+	void	(*uninit_context)(struct verbs_device *device,
+				struct ibv_context *ctx);
+	/* future fields added here */
+};
+
+struct ibv_context_ops {
+	int			(*query_device)(struct ibv_context *context,
+					      struct ibv_device_attr *device_attr);
+	int			(*query_port)(struct ibv_context *context, uint8_t port_num,
+					      struct ibv_port_attr *port_attr);
+	struct ibv_pd *		(*alloc_pd)(struct ibv_context *context);
+	int			(*dealloc_pd)(struct ibv_pd *pd);
+	struct ibv_mr *		(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
+					  int access);
+	struct ibv_mr *		(*rereg_mr)(struct ibv_mr *mr,
+					    int flags,
+					    struct ibv_pd *pd, void *addr,
+					    size_t length,
+					    int access);
+	int			(*dereg_mr)(struct ibv_mr *mr);
+	struct ibv_mw *		(*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type);
+	int			(*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw,
+					   struct ibv_mw_bind *mw_bind);
+	int			(*dealloc_mw)(struct ibv_mw *mw);
+	struct ibv_cq *		(*create_cq)(struct ibv_context *context, int cqe,
+					     struct ibv_comp_channel *channel,
+					     int comp_vector);
+	int			(*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc);
+	int			(*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
+	void			(*cq_event)(struct ibv_cq *cq);
+	int			(*resize_cq)(struct ibv_cq *cq, int cqe);
+	int			(*destroy_cq)(struct ibv_cq *cq);
+	struct ibv_srq *	(*create_srq)(struct ibv_pd *pd,
+					      struct ibv_srq_init_attr *srq_init_attr);
+	int			(*modify_srq)(struct ibv_srq *srq,
+					      struct ibv_srq_attr *srq_attr,
+					      int srq_attr_mask);
+	int			(*query_srq)(struct ibv_srq *srq,
+					     struct ibv_srq_attr *srq_attr);
+	int			(*destroy_srq)(struct ibv_srq *srq);
+	int			(*post_srq_recv)(struct ibv_srq *srq,
+						 struct ibv_recv_wr *recv_wr,
+						 struct ibv_recv_wr **bad_recv_wr);
+	struct ibv_qp *		(*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+	int			(*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+					    int attr_mask,
+					    struct ibv_qp_init_attr *init_attr);
+	int			(*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+					     int attr_mask);
+	int			(*destroy_qp)(struct ibv_qp *qp);
+	int			(*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr,
+					     struct ibv_send_wr **bad_wr);
+	int			(*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr,
+					     struct ibv_recv_wr **bad_wr);
+	struct ibv_ah *		(*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+	int			(*destroy_ah)(struct ibv_ah *ah);
+	int			(*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+						uint16_t lid);
+	int			(*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+						uint16_t lid);
+	void			(*async_event)(struct ibv_async_event *event);
+};
+
+struct ibv_context {
+	struct ibv_device      *device;
+	struct ibv_context_ops	ops;
+	int			cmd_fd;
+	int			async_fd;
+	int			num_comp_vectors;
+	pthread_mutex_t		mutex;
+	void		       *abi_compat;
+};
+
+enum verbs_context_mask {
+	VERBS_CONTEXT_XRCD         = (uint64_t)1 << 0,
+	VERBS_CONTEXT_SRQ          = (uint64_t)1 << 1,
+	VERBS_CONTEXT_QP           = (uint64_t)1 << 2,
+	VERBS_CONTEXT_RESERVED     = (uint64_t)1 << 3,
+	VERBS_CONTEXT_EXP	   = (uint64_t)1 << 62
+};
+
+struct verbs_context {
+	/*  "grows up" - new fields go here */
+	int (*_reserved_2) (void);
+	int (*destroy_flow) (struct ibv_flow *flow);
+	int (*_reserved_1) (void);
+	struct ibv_flow * (*create_flow) (struct ibv_qp *qp,
+					  struct ibv_flow_attr *flow_attr);
+	struct ibv_qp * (*open_qp)(struct ibv_context *context,
+			struct ibv_qp_open_attr *attr);
+	struct ibv_qp * (*create_qp_ex)(struct ibv_context *context,
+			struct ibv_qp_init_attr_ex *qp_init_attr_ex);
+	int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num);
+	struct ibv_srq * (*create_srq_ex)(struct ibv_context *context,
+			struct ibv_srq_init_attr_ex *srq_init_attr_ex);
+	struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context,
+			struct ibv_xrcd_init_attr *xrcd_init_attr);
+	int  (*close_xrcd)(struct ibv_xrcd *xrcd);
+	uint64_t has_comp_mask;
+	size_t   sz;	/* Must be immediately before struct ibv_context */
+	struct ibv_context context;/* Must be last field in the struct */
+};
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx)
+{
+	return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ?
+		NULL : container_of(ctx, struct verbs_context, context);
+}
+
+#define verbs_get_ctx_op(ctx, op) ({ \
+	struct verbs_context *_vctx = verbs_get_ctx(ctx); \
+	(!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \
+	!_vctx->op) ? NULL : _vctx; })*/
+
+#define verbs_set_ctx_op(_vctx, op, ptr) ({ \
+	struct verbs_context *vctx = _vctx; \
+	if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \
+		vctx->op = ptr; })
+
+static inline struct verbs_device *verbs_get_device(struct ibv_device *dev)	/* dev must be non-NULL: it is dereferenced unconditionally */
+{
+	return (dev->ops.alloc_context) ?	/* a non-NULL alloc_context op yields NULL */
+		NULL : container_of(dev, struct verbs_device, device);	/* otherwise step back from the embedded `device` member to its verbs_device */
+}
+
+static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {  /* inline wrapper: no NULL checks on qp, qp->context or the op */
+  return qp->context->ops.post_send(qp, wr, bad_wr);  /* dispatch through the post_send pointer in the QP's context ops table; its return value is passed through */
+}
+
+#endif  // NCCL_IBV_CORE_H_
diff --git a/src/include/ibvsymbols.h b/src/include/ibvsymbols.h
new file mode 100644
index 0000000000..7cf1e08d8c
--- /dev/null
+++ b/src/include/ibvsymbols.h
@@ -0,0 +1,44 @@
+#ifndef NCCL_IBV_SYMBOLS_H_
+#define NCCL_IBV_SYMBOLS_H_
+
+#ifdef NCCL_BUILD_RDMA_CORE
+#include <infiniband/verbs.h>
+#else
+#include "ibvcore.h"
+#endif
+
+#include "nccl.h"
+
+/* Table of IB verbs function pointers, one member per entry point (named ibv_internal_<call>); passed to buildIbvSymbols() declared below. */
+struct ncclIbvSymbols {
+  int (*ibv_internal_fork_init)(void);
+  struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
+  void (*ibv_internal_free_device_list)(struct ibv_device **list);
+  const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
+  struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
+  int (*ibv_internal_close_device)(struct ibv_context *context);
+  int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
+  void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
+  int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
+  int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+  int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+  int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+  struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
+  int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
+  struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+  struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);  /* note: access is unsigned int here, int in reg_mr and reg_dmabuf_mr */
+  /* DMA-BUF support */
+  struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
+  int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
+  struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+  int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
+  struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+  int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+  int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
+  const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
+};
+
+/* Builds the IB verbs symbol table in *ibvSymbols, using either direct rdma-core linking or dynamic loading; returns an ncclResult_t status. NOTE(review): the mode presumably follows NCCL_BUILD_RDMA_CORE - confirm in ibvsymbols.cc */
+ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
+
+#endif  // NCCL_IBV_SYMBOLS_H_
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index c7475890a7..d1c7d08e71 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -12,1044 +12,23 @@
 #ifndef NCCL_IBVWRAP_H_
 #define NCCL_IBVWRAP_H_
 
+#ifdef NCCL_BUILD_RDMA_CORE
+#include <infiniband/verbs.h>
+#else
+#include "ibvcore.h"
+#endif
+
 #include "core.h"
 #include <sys/types.h>
 #include <unistd.h>
 
-// Dynamically handle dependencies on IB verbs
-
-#if __GNUC__ >= 3
-#  define __attribute_const __attribute__((const))
-#else
-#  define __attribute_const
-#endif
-
-union ibv_gid {
-	uint8_t			raw[16];
-	struct {
-		uint64_t	subnet_prefix;
-		uint64_t	interface_id;
-	} global;
-};
-
-#ifndef container_of
-/**
-  * container_of - cast a member of a structure out to the containing structure
-  * @ptr:        the pointer to the member.
-  * @type:       the type of the container struct this is embedded in.
-  * @member:     the name of the member within the struct.
-  *
- */
-#define container_of(ptr, type, member) \
-	((type *) ((uint8_t *)(ptr) - offsetof(type, member)))
-#endif
-
-#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz))
-
-/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
-//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1;
-
-enum ibv_node_type {
-	IBV_NODE_UNKNOWN	= -1,
-	IBV_NODE_CA 		= 1,
-	IBV_NODE_SWITCH,
-	IBV_NODE_ROUTER,
-	IBV_NODE_RNIC,
-
-	/* Leave a gap for future node types before starting with
-	 * experimental node types.
-	 */
-	IBV_EXP_NODE_TYPE_START	= 32,
-	IBV_EXP_NODE_MIC	= IBV_EXP_NODE_TYPE_START
-};
-
-enum ibv_transport_type {
-	IBV_TRANSPORT_UNKNOWN	= -1,
-	IBV_TRANSPORT_IB	= 0,
-	IBV_TRANSPORT_IWARP,
-
-	/* Leave a gap for future transport types before starting with
-	 * experimental transport types.
-	 */
-	IBV_EXP_TRANSPORT_TYPE_START	= 32,
-	IBV_EXP_TRANSPORT_SCIF		= IBV_EXP_TRANSPORT_TYPE_START
-};
-
-enum ibv_device_cap_flags {
-	IBV_DEVICE_RESIZE_MAX_WR	= 1,
-	IBV_DEVICE_BAD_PKEY_CNTR	= 1 <<  1,
-	IBV_DEVICE_BAD_QKEY_CNTR	= 1 <<  2,
-	IBV_DEVICE_RAW_MULTI		= 1 <<  3,
-	IBV_DEVICE_AUTO_PATH_MIG	= 1 <<  4,
-	IBV_DEVICE_CHANGE_PHY_PORT	= 1 <<  5,
-	IBV_DEVICE_UD_AV_PORT_ENFORCE	= 1 <<  6,
-	IBV_DEVICE_CURR_QP_STATE_MOD	= 1 <<  7,
-	IBV_DEVICE_SHUTDOWN_PORT	= 1 <<  8,
-	IBV_DEVICE_INIT_TYPE		= 1 <<  9,
-	IBV_DEVICE_PORT_ACTIVE_EVENT	= 1 << 10,
-	IBV_DEVICE_SYS_IMAGE_GUID	= 1 << 11,
-	IBV_DEVICE_RC_RNR_NAK_GEN	= 1 << 12,
-	IBV_DEVICE_SRQ_RESIZE		= 1 << 13,
-	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14,
-	IBV_DEVICE_XRC			= 1 << 20,
-	IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
-};
-
-enum ibv_atomic_cap {
-	IBV_ATOMIC_NONE,
-	IBV_ATOMIC_HCA,
-	IBV_ATOMIC_GLOB
-};
-
-struct ibv_device_attr {
-	char			fw_ver[64];
-	uint64_t		node_guid;
-	uint64_t		sys_image_guid;
-	uint64_t		max_mr_size;
-	uint64_t		page_size_cap;
-	uint32_t		vendor_id;
-	uint32_t		vendor_part_id;
-	uint32_t		hw_ver;
-	int			max_qp;
-	int			max_qp_wr;
-	int			device_cap_flags;
-	int			max_sge;
-	int			max_sge_rd;
-	int			max_cq;
-	int			max_cqe;
-	int			max_mr;
-	int			max_pd;
-	int			max_qp_rd_atom;
-	int			max_ee_rd_atom;
-	int			max_res_rd_atom;
-	int			max_qp_init_rd_atom;
-	int			max_ee_init_rd_atom;
-	enum ibv_atomic_cap	atomic_cap;
-	int			max_ee;
-	int			max_rdd;
-	int			max_mw;
-	int			max_raw_ipv6_qp;
-	int			max_raw_ethy_qp;
-	int			max_mcast_grp;
-	int			max_mcast_qp_attach;
-	int			max_total_mcast_qp_attach;
-	int			max_ah;
-	int			max_fmr;
-	int			max_map_per_fmr;
-	int			max_srq;
-	int			max_srq_wr;
-	int			max_srq_sge;
-	uint16_t		max_pkeys;
-	uint8_t			local_ca_ack_delay;
-	uint8_t			phys_port_cnt;
-};
-
-enum ibv_mtu {
-	IBV_MTU_256  = 1,
-	IBV_MTU_512  = 2,
-	IBV_MTU_1024 = 3,
-	IBV_MTU_2048 = 4,
-	IBV_MTU_4096 = 5
-};
-
-enum ibv_port_state {
-	IBV_PORT_NOP		= 0,
-	IBV_PORT_DOWN		= 1,
-	IBV_PORT_INIT		= 2,
-	IBV_PORT_ARMED		= 3,
-	IBV_PORT_ACTIVE		= 4,
-	IBV_PORT_ACTIVE_DEFER	= 5
-};
-
-enum {
-	IBV_LINK_LAYER_UNSPECIFIED,
-	IBV_LINK_LAYER_INFINIBAND,
-	IBV_LINK_LAYER_ETHERNET,
-
-	/* Leave a gap for future link layer types before starting with
-	 * experimental link layer.
-	 */
-	IBV_EXP_LINK_LAYER_START	= 32,
-	IBV_EXP_LINK_LAYER_SCIF		= IBV_EXP_LINK_LAYER_START
-};
-
-enum ibv_port_cap_flags {
-	IBV_PORT_SM				= 1 <<  1,
-	IBV_PORT_NOTICE_SUP			= 1 <<  2,
-	IBV_PORT_TRAP_SUP			= 1 <<  3,
-	IBV_PORT_OPT_IPD_SUP			= 1 <<  4,
-	IBV_PORT_AUTO_MIGR_SUP			= 1 <<  5,
-	IBV_PORT_SL_MAP_SUP			= 1 <<  6,
-	IBV_PORT_MKEY_NVRAM			= 1 <<  7,
-	IBV_PORT_PKEY_NVRAM			= 1 <<  8,
-	IBV_PORT_LED_INFO_SUP			= 1 <<  9,
-	IBV_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
-	IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
-	IBV_PORT_EXTENDED_SPEEDS_SUP		= 1 << 14,
-	IBV_PORT_CM_SUP				= 1 << 16,
-	IBV_PORT_SNMP_TUNNEL_SUP		= 1 << 17,
-	IBV_PORT_REINIT_SUP			= 1 << 18,
-	IBV_PORT_DEVICE_MGMT_SUP		= 1 << 19,
-	IBV_PORT_VENDOR_CLASS			= 1 << 24,
-	IBV_PORT_CLIENT_REG_SUP			= 1 << 25,
-	IBV_PORT_IP_BASED_GIDS			= 1 << 26,
-};
-
-struct ibv_port_attr {
-	enum ibv_port_state	state;
-	enum ibv_mtu		max_mtu;
-	enum ibv_mtu		active_mtu;
-	int			gid_tbl_len;
-	uint32_t		port_cap_flags;
-	uint32_t		max_msg_sz;
-	uint32_t		bad_pkey_cntr;
-	uint32_t		qkey_viol_cntr;
-	uint16_t		pkey_tbl_len;
-	uint16_t		lid;
-	uint16_t		sm_lid;
-	uint8_t			lmc;
-	uint8_t			max_vl_num;
-	uint8_t			sm_sl;
-	uint8_t			subnet_timeout;
-	uint8_t			init_type_reply;
-	uint8_t			active_width;
-	uint8_t			active_speed;
-	uint8_t			phys_state;
-	uint8_t			link_layer;
-	uint8_t			reserved;
-};
-
-enum ibv_event_type {
-	IBV_EVENT_CQ_ERR,
-	IBV_EVENT_QP_FATAL,
-	IBV_EVENT_QP_REQ_ERR,
-	IBV_EVENT_QP_ACCESS_ERR,
-	IBV_EVENT_COMM_EST,
-	IBV_EVENT_SQ_DRAINED,
-	IBV_EVENT_PATH_MIG,
-	IBV_EVENT_PATH_MIG_ERR,
-	IBV_EVENT_DEVICE_FATAL,
-	IBV_EVENT_PORT_ACTIVE,
-	IBV_EVENT_PORT_ERR,
-	IBV_EVENT_LID_CHANGE,
-	IBV_EVENT_PKEY_CHANGE,
-	IBV_EVENT_SM_CHANGE,
-	IBV_EVENT_SRQ_ERR,
-	IBV_EVENT_SRQ_LIMIT_REACHED,
-	IBV_EVENT_QP_LAST_WQE_REACHED,
-	IBV_EVENT_CLIENT_REREGISTER,
-	IBV_EVENT_GID_CHANGE,
-
-	/* new experimental events start here leaving enough
-	 * room for 14 events which should be enough
-	 */
-	IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32,
-	IBV_EXP_EVENT_DCT_ACCESS_ERR,
-	IBV_EXP_EVENT_DCT_REQ_ERR,
-};
-
-struct ibv_async_event {
-	union {
-		struct ibv_cq  *cq;
-		struct ibv_qp  *qp;
-		struct ibv_srq *srq;
-		struct ibv_exp_dct *dct;
-		int		port_num;
-		/* For source compatible with Legacy API */
-		uint32_t	xrc_qp_num;
-	} element;
-	enum ibv_event_type	event_type;
-};
-
-enum ibv_wc_status {
-	IBV_WC_SUCCESS,
-	IBV_WC_LOC_LEN_ERR,
-	IBV_WC_LOC_QP_OP_ERR,
-	IBV_WC_LOC_EEC_OP_ERR,
-	IBV_WC_LOC_PROT_ERR,
-	IBV_WC_WR_FLUSH_ERR,
-	IBV_WC_MW_BIND_ERR,
-	IBV_WC_BAD_RESP_ERR,
-	IBV_WC_LOC_ACCESS_ERR,
-	IBV_WC_REM_INV_REQ_ERR,
-	IBV_WC_REM_ACCESS_ERR,
-	IBV_WC_REM_OP_ERR,
-	IBV_WC_RETRY_EXC_ERR,
-	IBV_WC_RNR_RETRY_EXC_ERR,
-	IBV_WC_LOC_RDD_VIOL_ERR,
-	IBV_WC_REM_INV_RD_REQ_ERR,
-	IBV_WC_REM_ABORT_ERR,
-	IBV_WC_INV_EECN_ERR,
-	IBV_WC_INV_EEC_STATE_ERR,
-	IBV_WC_FATAL_ERR,
-	IBV_WC_RESP_TIMEOUT_ERR,
-	IBV_WC_GENERAL_ERR
-};
-const char *ibv_wc_status_str(enum ibv_wc_status status);
-
-enum ibv_wc_opcode {
-	IBV_WC_SEND,
-	IBV_WC_RDMA_WRITE,
-	IBV_WC_RDMA_READ,
-	IBV_WC_COMP_SWAP,
-	IBV_WC_FETCH_ADD,
-	IBV_WC_BIND_MW,
-/*
- * Set value of IBV_WC_RECV so consumers can test if a completion is a
- * receive by testing (opcode & IBV_WC_RECV).
- */
-	IBV_WC_RECV			= 1 << 7,
-	IBV_WC_RECV_RDMA_WITH_IMM
-};
-
-enum ibv_wc_flags {
-	IBV_WC_GRH		= 1 << 0,
-	IBV_WC_WITH_IMM		= 1 << 1
-};
-
-struct ibv_wc {
-	uint64_t		wr_id;
-	enum ibv_wc_status	status;
-	enum ibv_wc_opcode	opcode;
-	uint32_t		vendor_err;
-	uint32_t		byte_len;
-	uint32_t		imm_data;	/* in network byte order */
-	uint32_t		qp_num;
-	uint32_t		src_qp;
-	int			wc_flags;
-	uint16_t		pkey_index;
-	uint16_t		slid;
-	uint8_t			sl;
-	uint8_t			dlid_path_bits;
-};
-
-enum ibv_access_flags {
-	IBV_ACCESS_LOCAL_WRITE		= 1,
-	IBV_ACCESS_REMOTE_WRITE		= (1<<1),
-	IBV_ACCESS_REMOTE_READ		= (1<<2),
-	IBV_ACCESS_REMOTE_ATOMIC	= (1<<3),
-	IBV_ACCESS_MW_BIND		= (1<<4),
-	IBV_ACCESS_RELAXED_ORDERING     = (1<<20),
-};
-
-struct ibv_pd {
-	struct ibv_context     *context;
-	uint32_t		handle;
-};
-
-enum ibv_xrcd_init_attr_mask {
-	IBV_XRCD_INIT_ATTR_FD	    = 1 << 0,
-	IBV_XRCD_INIT_ATTR_OFLAGS   = 1 << 1,
-	IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2
-};
-
-struct ibv_xrcd_init_attr {
-	uint32_t comp_mask;
-	int	 fd;
-	int	 oflags;
-};
-
-struct ibv_xrcd {
-	struct ibv_context     *context;
-};
-
-enum ibv_rereg_mr_flags {
-	IBV_REREG_MR_CHANGE_TRANSLATION	= (1 << 0),
-	IBV_REREG_MR_CHANGE_PD		= (1 << 1),
-	IBV_REREG_MR_CHANGE_ACCESS	= (1 << 2),
-	IBV_REREG_MR_KEEP_VALID		= (1 << 3)
-};
-
-struct ibv_mr {
-	struct ibv_context     *context;
-	struct ibv_pd	       *pd;
-	void		       *addr;
-	size_t			length;
-	uint32_t		handle;
-	uint32_t		lkey;
-	uint32_t		rkey;
-};
-
-enum ibv_mw_type {
-	IBV_MW_TYPE_1			= 1,
-	IBV_MW_TYPE_2			= 2
-};
-
-struct ibv_mw {
-	struct ibv_context     *context;
-	struct ibv_pd	       *pd;
-	uint32_t		rkey;
-};
-
-struct ibv_global_route {
-	union ibv_gid		dgid;
-	uint32_t		flow_label;
-	uint8_t			sgid_index;
-	uint8_t			hop_limit;
-	uint8_t			traffic_class;
-};
-
-struct ibv_grh {
-	uint32_t		version_tclass_flow;
-	uint16_t		paylen;
-	uint8_t			next_hdr;
-	uint8_t			hop_limit;
-	union ibv_gid		sgid;
-	union ibv_gid		dgid;
-};
-
-enum ibv_rate {
-	IBV_RATE_MAX      = 0,
-	IBV_RATE_2_5_GBPS = 2,
-	IBV_RATE_5_GBPS   = 5,
-	IBV_RATE_10_GBPS  = 3,
-	IBV_RATE_20_GBPS  = 6,
-	IBV_RATE_30_GBPS  = 4,
-	IBV_RATE_40_GBPS  = 7,
-	IBV_RATE_60_GBPS  = 8,
-	IBV_RATE_80_GBPS  = 9,
-	IBV_RATE_120_GBPS = 10,
-	IBV_RATE_14_GBPS  = 11,
-	IBV_RATE_56_GBPS  = 12,
-	IBV_RATE_112_GBPS = 13,
-	IBV_RATE_168_GBPS = 14,
-	IBV_RATE_25_GBPS  = 15,
-	IBV_RATE_100_GBPS = 16,
-	IBV_RATE_200_GBPS = 17,
-	IBV_RATE_300_GBPS = 18
-};
-
-/**
- * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the
- * base rate of 2.5 Gbit/sec.  For example, IBV_RATE_5_GBPS will be
- * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
- * @rate: rate to convert.
- */
-int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const;
-
-/**
- * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum.
- * @mult: multiple to convert.
- */
-enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const;
-
-/**
- * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec.
- * For example, IBV_RATE_5_GBPS will return the value 5000.
- * @rate: rate to convert.
- */
-int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const;
-
-/**
- * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum.
- * @mbps: value to convert.
- */
-enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const;
-
-struct ibv_ah_attr {
-	struct ibv_global_route	grh;
-	uint16_t		dlid;
-	uint8_t			sl;
-	uint8_t			src_path_bits;
-	uint8_t			static_rate;
-	uint8_t			is_global;
-	uint8_t			port_num;
-};
-
-enum ibv_srq_attr_mask {
-	IBV_SRQ_MAX_WR	= 1 << 0,
-	IBV_SRQ_LIMIT	= 1 << 1
-};
-
-struct ibv_srq_attr {
-	uint32_t		max_wr;
-	uint32_t		max_sge;
-	uint32_t		srq_limit;
-};
-
-struct ibv_srq_init_attr {
-	void		       *srq_context;
-	struct ibv_srq_attr	attr;
-};
-
-enum ibv_srq_type {
-	IBV_SRQT_BASIC,
-	IBV_SRQT_XRC
-};
-
-enum ibv_srq_init_attr_mask {
-	IBV_SRQ_INIT_ATTR_TYPE		= 1 << 0,
-	IBV_SRQ_INIT_ATTR_PD		= 1 << 1,
-	IBV_SRQ_INIT_ATTR_XRCD		= 1 << 2,
-	IBV_SRQ_INIT_ATTR_CQ		= 1 << 3,
-	IBV_SRQ_INIT_ATTR_RESERVED	= 1 << 4
-};
-
-struct ibv_srq_init_attr_ex {
-	void		       *srq_context;
-	struct ibv_srq_attr	attr;
-
-	uint32_t		comp_mask;
-	enum ibv_srq_type	srq_type;
-	struct ibv_pd	       *pd;
-	struct ibv_xrcd	       *xrcd;
-	struct ibv_cq	       *cq;
-};
-
-enum ibv_qp_type {
-	IBV_QPT_RC = 2,
-	IBV_QPT_UC,
-	IBV_QPT_UD,
-	/* XRC compatible code */
-	IBV_QPT_XRC,
-	IBV_QPT_RAW_PACKET = 8,
-	IBV_QPT_RAW_ETH = 8,
-	IBV_QPT_XRC_SEND = 9,
-	IBV_QPT_XRC_RECV,
-
-	/* Leave a gap for future qp types before starting with
-	 * experimental qp types.
-	 */
-	IBV_EXP_QP_TYPE_START	= 32,
-	IBV_EXP_QPT_DC_INI	= IBV_EXP_QP_TYPE_START
-};
-
-struct ibv_qp_cap {
-	uint32_t		max_send_wr;
-	uint32_t		max_recv_wr;
-	uint32_t		max_send_sge;
-	uint32_t		max_recv_sge;
-	uint32_t		max_inline_data;
-};
-
-struct ibv_qp_init_attr {
-	void		       *qp_context;
-	struct ibv_cq	       *send_cq;
-	struct ibv_cq	       *recv_cq;
-	struct ibv_srq	       *srq;
-	struct ibv_qp_cap	cap;
-	enum ibv_qp_type	qp_type;
-	int			sq_sig_all;
-	/* Below is needed for backwards compatabile */
-	struct ibv_xrc_domain  *xrc_domain;
-};
-
-enum ibv_qp_init_attr_mask {
-	IBV_QP_INIT_ATTR_PD		= 1 << 0,
-	IBV_QP_INIT_ATTR_XRCD		= 1 << 1,
-	IBV_QP_INIT_ATTR_RESERVED	= 1 << 2
-};
-
-struct ibv_qp_init_attr_ex {
-	void		       *qp_context;
-	struct ibv_cq	       *send_cq;
-	struct ibv_cq	       *recv_cq;
-	struct ibv_srq	       *srq;
-	struct ibv_qp_cap	cap;
-	enum ibv_qp_type	qp_type;
-	int			sq_sig_all;
-
-	uint32_t		comp_mask;
-	struct ibv_pd	       *pd;
-	struct ibv_xrcd	       *xrcd;
-};
-
-enum ibv_qp_open_attr_mask {
-	IBV_QP_OPEN_ATTR_NUM		= 1 << 0,
-	IBV_QP_OPEN_ATTR_XRCD		= 1 << 1,
-	IBV_QP_OPEN_ATTR_CONTEXT	= 1 << 2,
-	IBV_QP_OPEN_ATTR_TYPE		= 1 << 3,
-	IBV_QP_OPEN_ATTR_RESERVED	= 1 << 4
-};
-
-struct ibv_qp_open_attr {
-	uint32_t		comp_mask;
-	uint32_t		qp_num;
-	struct ibv_xrcd        *xrcd;
-	void		       *qp_context;
-	enum ibv_qp_type	qp_type;
-};
-
-enum ibv_qp_attr_mask {
-	IBV_QP_STATE			= 1 << 	0,
-	IBV_QP_CUR_STATE		= 1 << 	1,
-	IBV_QP_EN_SQD_ASYNC_NOTIFY	= 1 << 	2,
-	IBV_QP_ACCESS_FLAGS		= 1 << 	3,
-	IBV_QP_PKEY_INDEX		= 1 << 	4,
-	IBV_QP_PORT			= 1 << 	5,
-	IBV_QP_QKEY			= 1 << 	6,
-	IBV_QP_AV			= 1 << 	7,
-	IBV_QP_PATH_MTU			= 1 << 	8,
-	IBV_QP_TIMEOUT			= 1 << 	9,
-	IBV_QP_RETRY_CNT		= 1 << 10,
-	IBV_QP_RNR_RETRY		= 1 << 11,
-	IBV_QP_RQ_PSN			= 1 << 12,
-	IBV_QP_MAX_QP_RD_ATOMIC		= 1 << 13,
-	IBV_QP_ALT_PATH			= 1 << 14,
-	IBV_QP_MIN_RNR_TIMER		= 1 << 15,
-	IBV_QP_SQ_PSN			= 1 << 16,
-	IBV_QP_MAX_DEST_RD_ATOMIC	= 1 << 17,
-	IBV_QP_PATH_MIG_STATE		= 1 << 18,
-	IBV_QP_CAP			= 1 << 19,
-	IBV_QP_DEST_QPN			= 1 << 20
-};
-
-enum ibv_qp_state {
-	IBV_QPS_RESET,
-	IBV_QPS_INIT,
-	IBV_QPS_RTR,
-	IBV_QPS_RTS,
-	IBV_QPS_SQD,
-	IBV_QPS_SQE,
-	IBV_QPS_ERR,
-	IBV_QPS_UNKNOWN
-};
-
-enum ibv_mig_state {
-	IBV_MIG_MIGRATED,
-	IBV_MIG_REARM,
-	IBV_MIG_ARMED
-};
-
-struct ibv_qp_attr {
-	enum ibv_qp_state	qp_state;
-	enum ibv_qp_state	cur_qp_state;
-	enum ibv_mtu		path_mtu;
-	enum ibv_mig_state	path_mig_state;
-	uint32_t		qkey;
-	uint32_t		rq_psn;
-	uint32_t		sq_psn;
-	uint32_t		dest_qp_num;
-	int			qp_access_flags;
-	struct ibv_qp_cap	cap;
-	struct ibv_ah_attr	ah_attr;
-	struct ibv_ah_attr	alt_ah_attr;
-	uint16_t		pkey_index;
-	uint16_t		alt_pkey_index;
-	uint8_t			en_sqd_async_notify;
-	uint8_t			sq_draining;
-	uint8_t			max_rd_atomic;
-	uint8_t			max_dest_rd_atomic;
-	uint8_t			min_rnr_timer;
-	uint8_t			port_num;
-	uint8_t			timeout;
-	uint8_t			retry_cnt;
-	uint8_t			rnr_retry;
-	uint8_t			alt_port_num;
-	uint8_t			alt_timeout;
-};
-
-enum ibv_wr_opcode {
-	IBV_WR_RDMA_WRITE,
-	IBV_WR_RDMA_WRITE_WITH_IMM,
-	IBV_WR_SEND,
-	IBV_WR_SEND_WITH_IMM,
-	IBV_WR_RDMA_READ,
-	IBV_WR_ATOMIC_CMP_AND_SWP,
-	IBV_WR_ATOMIC_FETCH_AND_ADD
-};
-
-enum ibv_send_flags {
-	IBV_SEND_FENCE		= 1 << 0,
-	IBV_SEND_SIGNALED	= 1 << 1,
-	IBV_SEND_SOLICITED	= 1 << 2,
-	IBV_SEND_INLINE		= 1 << 3
-};
-
-struct ibv_sge {
-	uint64_t		addr;
-	uint32_t		length;
-	uint32_t		lkey;
-};
-
-struct ibv_send_wr {
-	uint64_t		wr_id;
-	struct ibv_send_wr     *next;
-	struct ibv_sge	       *sg_list;
-	int			num_sge;
-	enum ibv_wr_opcode	opcode;
-	int			send_flags;
-	uint32_t		imm_data;	/* in network byte order */
-	union {
-		struct {
-			uint64_t	remote_addr;
-			uint32_t	rkey;
-		} rdma;
-		struct {
-			uint64_t	remote_addr;
-			uint64_t	compare_add;
-			uint64_t	swap;
-			uint32_t	rkey;
-		} atomic;
-		struct {
-			struct ibv_ah  *ah;
-			uint32_t	remote_qpn;
-			uint32_t	remote_qkey;
-		} ud;
-	} wr;
-	union {
-		union {
-			struct {
-				uint32_t    remote_srqn;
-			} xrc;
-		} qp_type;
-
-		uint32_t		xrc_remote_srq_num;
-	};
-};
-
-struct ibv_recv_wr {
-	uint64_t		wr_id;
-	struct ibv_recv_wr     *next;
-	struct ibv_sge	       *sg_list;
-	int			num_sge;
-};
-
-struct ibv_mw_bind {
-	uint64_t		wr_id;
-	struct ibv_mr	       *mr;
-	void		       *addr;
-	size_t			length;
-	int			send_flags;
-	int			mw_access_flags;
-};
-
-struct ibv_srq {
-	struct ibv_context     *context;
-	void		       *srq_context;
-	struct ibv_pd	       *pd;
-	uint32_t		handle;
-
-	pthread_mutex_t		mutex;
-	pthread_cond_t		cond;
-	uint32_t		events_completed;
-
-	/* below are for source compatabilty with legacy XRC,
-	*   padding based on ibv_srq_legacy.
-	*/
-	uint32_t		xrc_srq_num_bin_compat_padding;
-	struct ibv_xrc_domain	*xrc_domain_bin_compat_padding;
-	struct ibv_cq	*xrc_cq_bin_compat_padding;
-	void		*ibv_srq_padding;
-
-	/* legacy fields */
-	uint32_t		xrc_srq_num;
-	struct ibv_xrc_domain	*xrc_domain;
-	struct ibv_cq		*xrc_cq;
-};
-
-/* Not in use in new API, needed for compilation as part of source compat layer */
-enum ibv_event_flags {
-	IBV_XRC_QP_EVENT_FLAG = 0x80000000,
-};
-
-
-
-struct ibv_qp {
-	struct ibv_context     *context;
-	void		       *qp_context;
-	struct ibv_pd	       *pd;
-	struct ibv_cq	       *send_cq;
-	struct ibv_cq	       *recv_cq;
-	struct ibv_srq	       *srq;
-	uint32_t		handle;
-	uint32_t		qp_num;
-	enum ibv_qp_state       state;
-	enum ibv_qp_type	qp_type;
-
-	pthread_mutex_t		mutex;
-	pthread_cond_t		cond;
-	uint32_t		events_completed;
-};
-
-struct ibv_comp_channel {
-	struct ibv_context     *context;
-	int			fd;
-	int			refcnt;
-};
-
-struct ibv_cq {
-	struct ibv_context     *context;
-	struct ibv_comp_channel *channel;
-	void		       *cq_context;
-	uint32_t		handle;
-	int			cqe;
-
-	pthread_mutex_t		mutex;
-	pthread_cond_t		cond;
-	uint32_t		comp_events_completed;
-	uint32_t		async_events_completed;
-};
-
-struct ibv_ah {
-	struct ibv_context     *context;
-	struct ibv_pd	       *pd;
-	uint32_t		handle;
-};
-
-enum ibv_flow_flags {
-	IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1,
-	IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1,
-};
-
-enum ibv_flow_attr_type {
-	/* steering according to rule specifications */
-	IBV_FLOW_ATTR_NORMAL		= 0x0,
-	/* default unicast and multicast rule -
-	 * receive all Eth traffic which isn't steered to any QP
-	 */
-	IBV_FLOW_ATTR_ALL_DEFAULT	= 0x1,
-	/* default multicast rule -
-	 * receive all Eth multicast traffic which isn't steered to any QP
-	 */
-	IBV_FLOW_ATTR_MC_DEFAULT	= 0x2,
-};
-
-enum ibv_flow_spec_type {
-	IBV_FLOW_SPEC_ETH	= 0x20,
-	IBV_FLOW_SPEC_IPV4	= 0x30,
-	IBV_FLOW_SPEC_TCP	= 0x40,
-	IBV_FLOW_SPEC_UDP	= 0x41,
-};
-
-struct ibv_flow_eth_filter {
-	uint8_t		dst_mac[6];
-	uint8_t		src_mac[6];
-	uint16_t	ether_type;
-	/*
-	 * same layout as 802.1q: prio 3, cfi 1, vlan id 12
-	 */
-	uint16_t	vlan_tag;
-};
-
-struct ibv_flow_spec_eth {
-	enum ibv_flow_spec_type  type;
-	uint16_t  size;
-	struct ibv_flow_eth_filter val;
-	struct ibv_flow_eth_filter mask;
-};
-
-struct ibv_flow_ipv4_filter {
-	uint32_t src_ip;
-	uint32_t dst_ip;
-};
-
-struct ibv_flow_spec_ipv4 {
-	enum ibv_flow_spec_type  type;
-	uint16_t  size;
-	struct ibv_flow_ipv4_filter val;
-	struct ibv_flow_ipv4_filter mask;
-};
-
-struct ibv_flow_tcp_udp_filter {
-	uint16_t dst_port;
-	uint16_t src_port;
-};
-
-struct ibv_flow_spec_tcp_udp {
-	enum ibv_flow_spec_type  type;
-	uint16_t  size;
-	struct ibv_flow_tcp_udp_filter val;
-	struct ibv_flow_tcp_udp_filter mask;
-};
-
-struct ibv_flow_spec {
-	union {
-		struct {
-			enum ibv_flow_spec_type	type;
-			uint16_t		size;
-		} hdr;
-		struct ibv_flow_spec_eth eth;
-		struct ibv_flow_spec_ipv4 ipv4;
-		struct ibv_flow_spec_tcp_udp tcp_udp;
-	};
-};
-
-struct ibv_flow_attr {
-	uint32_t comp_mask;
-	enum ibv_flow_attr_type type;
-	uint16_t size;
-	uint16_t priority;
-	uint8_t num_of_specs;
-	uint8_t port;
-	uint32_t flags;
-	/* Following are the optional layers according to user request
-	 * struct ibv_flow_spec_xxx [L2]
-	 * struct ibv_flow_spec_yyy [L3/L4]
-	 */
-};
-
-struct ibv_flow {
-	uint32_t	   comp_mask;
-	struct ibv_context *context;
-	uint32_t	   handle;
-};
-
-struct ibv_device;
-struct ibv_context;
-
-struct ibv_device_ops {
-	struct ibv_context *	(*alloc_context)(struct ibv_device *device, int cmd_fd);
-	void			(*free_context)(struct ibv_context *context);
-};
-
-enum {
-	IBV_SYSFS_NAME_MAX	= 64,
-	IBV_SYSFS_PATH_MAX	= 256
-};
-
-struct ibv_device {
-	struct ibv_device_ops	ops;
-	enum ibv_node_type	node_type;
-	enum ibv_transport_type	transport_type;
-	/* Name of underlying kernel IB device, eg "mthca0" */
-	char			name[IBV_SYSFS_NAME_MAX];
-	/* Name of uverbs device, eg "uverbs0" */
-	char			dev_name[IBV_SYSFS_NAME_MAX];
-	/* Path to infiniband_verbs class device in sysfs */
-	char			dev_path[IBV_SYSFS_PATH_MAX];
-	/* Path to infiniband class device in sysfs */
-	char			ibdev_path[IBV_SYSFS_PATH_MAX];
-};
-
-struct verbs_device {
-	struct ibv_device device; /* Must be first */
-	size_t	sz;
-	size_t	size_of_context;
-	int	(*init_context)(struct verbs_device *device,
-				struct ibv_context *ctx, int cmd_fd);
-	void	(*uninit_context)(struct verbs_device *device,
-				struct ibv_context *ctx);
-	/* future fields added here */
-};
-
-struct ibv_context_ops {
-	int			(*query_device)(struct ibv_context *context,
-					      struct ibv_device_attr *device_attr);
-	int			(*query_port)(struct ibv_context *context, uint8_t port_num,
-					      struct ibv_port_attr *port_attr);
-	struct ibv_pd *		(*alloc_pd)(struct ibv_context *context);
-	int			(*dealloc_pd)(struct ibv_pd *pd);
-	struct ibv_mr *		(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
-					  int access);
-	struct ibv_mr *		(*rereg_mr)(struct ibv_mr *mr,
-					    int flags,
-					    struct ibv_pd *pd, void *addr,
-					    size_t length,
-					    int access);
-	int			(*dereg_mr)(struct ibv_mr *mr);
-	struct ibv_mw *		(*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type);
-	int			(*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw,
-					   struct ibv_mw_bind *mw_bind);
-	int			(*dealloc_mw)(struct ibv_mw *mw);
-	struct ibv_cq *		(*create_cq)(struct ibv_context *context, int cqe,
-					     struct ibv_comp_channel *channel,
-					     int comp_vector);
-	int			(*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc);
-	int			(*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
-	void			(*cq_event)(struct ibv_cq *cq);
-	int			(*resize_cq)(struct ibv_cq *cq, int cqe);
-	int			(*destroy_cq)(struct ibv_cq *cq);
-	struct ibv_srq *	(*create_srq)(struct ibv_pd *pd,
-					      struct ibv_srq_init_attr *srq_init_attr);
-	int			(*modify_srq)(struct ibv_srq *srq,
-					      struct ibv_srq_attr *srq_attr,
-					      int srq_attr_mask);
-	int			(*query_srq)(struct ibv_srq *srq,
-					     struct ibv_srq_attr *srq_attr);
-	int			(*destroy_srq)(struct ibv_srq *srq);
-	int			(*post_srq_recv)(struct ibv_srq *srq,
-						 struct ibv_recv_wr *recv_wr,
-						 struct ibv_recv_wr **bad_recv_wr);
-	struct ibv_qp *		(*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
-	int			(*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
-					    int attr_mask,
-					    struct ibv_qp_init_attr *init_attr);
-	int			(*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
-					     int attr_mask);
-	int			(*destroy_qp)(struct ibv_qp *qp);
-	int			(*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr,
-					     struct ibv_send_wr **bad_wr);
-	int			(*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr,
-					     struct ibv_recv_wr **bad_wr);
-	struct ibv_ah *		(*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr);
-	int			(*destroy_ah)(struct ibv_ah *ah);
-	int			(*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
-						uint16_t lid);
-	int			(*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
-						uint16_t lid);
-	void			(*async_event)(struct ibv_async_event *event);
-};
-
-struct ibv_context {
-	struct ibv_device      *device;
-	struct ibv_context_ops	ops;
-	int			cmd_fd;
-	int			async_fd;
-	int			num_comp_vectors;
-	pthread_mutex_t		mutex;
-	void		       *abi_compat;
-};
-
-enum verbs_context_mask {
-	VERBS_CONTEXT_XRCD         = (uint64_t)1 << 0,
-	VERBS_CONTEXT_SRQ          = (uint64_t)1 << 1,
-	VERBS_CONTEXT_QP           = (uint64_t)1 << 2,
-	VERBS_CONTEXT_RESERVED     = (uint64_t)1 << 3,
-	VERBS_CONTEXT_EXP	   = (uint64_t)1 << 62
-};
-
-struct verbs_context {
-	/*  "grows up" - new fields go here */
-	int (*_reserved_2) (void);
-	int (*destroy_flow) (struct ibv_flow *flow);
-	int (*_reserved_1) (void);
-	struct ibv_flow * (*create_flow) (struct ibv_qp *qp,
-					  struct ibv_flow_attr *flow_attr);
-	struct ibv_qp * (*open_qp)(struct ibv_context *context,
-			struct ibv_qp_open_attr *attr);
-	struct ibv_qp * (*create_qp_ex)(struct ibv_context *context,
-			struct ibv_qp_init_attr_ex *qp_init_attr_ex);
-	int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num);
-	struct ibv_srq * (*create_srq_ex)(struct ibv_context *context,
-			struct ibv_srq_init_attr_ex *srq_init_attr_ex);
-	struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context,
-			struct ibv_xrcd_init_attr *xrcd_init_attr);
-	int  (*close_xrcd)(struct ibv_xrcd *xrcd);
-	uint64_t has_comp_mask;
-	size_t   sz;	/* Must be immediately before struct ibv_context */
-	struct ibv_context context;/* Must be last field in the struct */
-};
-
-/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
-/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx)
-{
-	return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ?
-		NULL : container_of(ctx, struct verbs_context, context);
-}
-
-#define verbs_get_ctx_op(ctx, op) ({ \
-	struct verbs_context *_vctx = verbs_get_ctx(ctx); \
-	(!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \
-	!_vctx->op) ? NULL : _vctx; })*/
-
-#define verbs_set_ctx_op(_vctx, op, ptr) ({ \
-	struct verbs_context *vctx = _vctx; \
-	if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \
-		vctx->op = ptr; })
-
-static inline struct verbs_device *verbs_get_device(struct ibv_device *dev)
-{
-	return (dev->ops.alloc_context) ?
-		NULL : container_of(dev, struct verbs_device, device);
-}
-
 typedef enum ibv_return_enum
 {
     IBV_SUCCESS = 0,                   //!< The operation was successful
 } ibv_return_t;
 
 ncclResult_t wrap_ibv_symbols(void);
+/* NCCL wrappers of IB verbs functions */
 ncclResult_t wrap_ibv_fork_init(void);
 ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
 ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
@@ -1087,9 +66,6 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries,
 ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
 ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
 ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
-static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
-  return qp->context->ops.post_send(qp, wr, bad_wr);
-}
 
 static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
   int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
diff --git a/src/include/info.h b/src/include/info.h
index 193d820f51..ab07b5e814 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -26,6 +26,7 @@ typedef enum : uint8_t {
   ncclPatternCollnetChain,
   ncclPatternCollnetDirect,
   ncclPatternNvls,
+  ncclPatternNvlsTree,
   ncclPatternSend,
   ncclPatternRecv
 } ncclPattern_t;
@@ -94,7 +95,6 @@ struct ncclCudaStreamList {
   struct ncclCudaStreamList *next;
   cudaStream_t stream;
 };
-
 struct ncclTasks {
   struct Peer {
     bool sendSeen, recvSeen;
@@ -104,7 +104,8 @@ struct ncclTasks {
   struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
   size_t collBytesTotal;
   struct Peer* peers/*[nRanks]*/;
-  int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
+  int *p2pSendOrder, *p2pRecvOrder;
+  int p2pOrderSteps;
   int nTasksColl, nTasksP2p;
 
   // The list of user streams aggregated over all tasks present.
diff --git a/src/include/net.h b/src/include/net.h
index 5a7b5e3a74..b5df589683 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -18,25 +18,6 @@ ncclResult_t ncclNetPluginInit();
 ncclResult_t ncclNetInit(struct ncclComm* comm);
 int ncclNetVersion(struct ncclComm* comm);
 
-// Translation to external API
-static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
-static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
-static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
-static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
-/* DMA-BUF support */
-static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
-static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
-
 // Test whether the current GPU support GPU Direct RDMA.
 ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
 
diff --git a/src/include/nvtx3/nvtx3.hpp b/src/include/nvtx3/nvtx3.hpp
index cb0ef6858f..8c62acd469 100644
--- a/src/include/nvtx3/nvtx3.hpp
+++ b/src/include/nvtx3/nvtx3.hpp
@@ -126,7 +126,7 @@
  * Systems:
  *
  * \image html
- * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
+ * https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png
  *
  * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
  * ranges to your code that automatically use the name of the enclosing function
@@ -561,18 +561,27 @@
 
 /* Temporary helper #defines, removed with #undef at end of header */
 
-#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
-#if defined(_MSC_VER) && _MSC_VER < 1914
-/* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser
- * that does not work with domain::get's specialization for domain::global,
- * and would require extra conditions to make SFINAE work for the overloaded
- * get() functions.  This macro disables use of overloaded get() in order to
- * work with VS2015 and versions of VS2017 below 15.7, without penalizing
- * users of newer compilers.  Building with this flag set to 0 means errors
- * when defining tag structs (see documentation for domain, named_category,
- * and registered_string) will have more complex compiler error messages
- * instead of the clear static_assert messages from the get() overloads.
+/* Some compilers do not correctly support SFINAE, which is used in this API
+ * to detect common usage errors and provide clearer error messages (by using
+ * static_assert) than the compiler would produce otherwise.  These compilers
+ * will generate errors while compiling this file such as:
+ *
+ *  error: ‘name’ is not a member of ‘nvtx3::v1::domain::global’
+ *
+ * The following compiler versions are known to have this problem, and so are
+ * set by default to disable the SFINAE-based checks:
+ *
+ * - All MSVC versions prior to VS2017 Update 7 (15.7)
+ * - GCC 8.1-8.3 (the problem was fixed in GCC 8.4)
+ *
+ * If you find your compiler hits this problem, you can work around it by
+ * defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this
+ * header, or you can add a check for your compiler version to this #if.
+ * Also, please report the issue on the NVTX github page.
  */
+#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
+#if defined(_MSC_VER) && _MSC_VER < 1914 \
+  || defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4
 #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0
 #else
 #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1
diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h b/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
index c2c1ac596f..7c166bd34b 100644
--- a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
+++ b/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
@@ -1,30 +1,33 @@
+/*
+* Copyright 2021-2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
 #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
 #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
 #endif
 
-/*
- * Helper array to get the alignment for each predefined C language type.
- */
-
 typedef void* pointer_type;
 
-#if __STDC_VERSION__ >= 201112L /* or CPP11 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
+#include <uchar.h>
 #include <stdalign.h>
+#endif
+
+/* `alignof` is available as of C11 or C++11 */
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
+
 #define nvtx_alignof(type) alignof(type)
 #define nvtx_alignof2(type,tname) alignof(type)
-#else /*  __STDC_VERSION__ >= 201112L */
-#ifndef __cplusplus
 
-#include <stddef.h>
-#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
-#define nvtx_alignof2(type,tname) nvtx_alignof(type)
+#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */
 
-#else /* __cplusplus */
-
-#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
-#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
-#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
-#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
+/* Create helper structs to determine type alignment. */
+#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type
+#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname
 
 MKTYPEDEF(char);
 MKTYPEDEF2(unsigned char, uchar);
@@ -54,22 +57,33 @@ MKTYPEDEF(size_t);
 MKTYPEDEF(pointer_type);
 
 MKTYPEDEF(wchar_t);
-#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
-    {sizeof(char8_t), nvtx_alignof(char8_t)},
+
+/* `char8_t` is available as of C++20 or C23 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
     MKTYPEDEF(char8_t);
 #endif
-#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
+
+/* `char16_t` and `char32_t` are available as of C++11 or C11 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
     MKTYPEDEF(char16_t);
     MKTYPEDEF(char32_t);
 #endif
 
+/* In C, stddef.h must be included to use `offsetof` */
+#ifndef __cplusplus
+#include <stddef.h>
+#endif
+
+#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d)
+#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d)
+
+#endif /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */
+
 #undef MKTYPEDEF
 #undef MKTYPEDEF2
 
-#endif /* __cplusplus */
-#endif /*  __STDC_VERSION__ >= 201112L */
-
 /*
+ * Helper array to get the alignment for each predefined C/C++ language type.
  * The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`.
  */
 const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
@@ -109,13 +123,14 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
 
     /*** Special character types ***/
     /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
-    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
-#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
-    {sizeof(char8_t), nvtx_alignof(char8_t)},
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)},
 #else
-    {0, 0},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0},
 #endif
-#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
     /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
     /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
 #else
@@ -125,4 +140,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
 };
 
 #undef nvtx_alignof
-#undef nvtx_alignof2
\ No newline at end of file
+#undef nvtx_alignof2
diff --git a/src/include/p2p.h b/src/include/p2p.h
index 69d1ea77c1..426a15017a 100644
--- a/src/include/p2p.h
+++ b/src/include/p2p.h
@@ -9,4 +9,21 @@
 #ifndef NCCL_P2P_H_
 #define NCCL_P2P_H_
 
+#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+
+typedef struct {
+  int data; // Currently only an fd-based descriptor is supported
+} ncclCuDesc;
+
+typedef union {
+  // Legacy CUDA IPC
+  cudaIpcMemHandle_t devIpc;
+  // cuMem API support
+  ncclCuDesc cuDesc;
+} ncclIpcDesc;
+
+ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
+ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
+ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
+
 #endif
diff --git a/src/include/proxy.h b/src/include/proxy.h
index 83b8937861..17db4bcef0 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -15,11 +15,13 @@
 #include "ipcsocket.h"
 #include <pthread.h>
 #include "shm.h"
+#include "p2p.h"
 
 enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
+enum { proxyRecv=0, proxySend=1 };
 
 struct ncclProxyArgs;
-typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
 
 #define NCCL_PROXY_MAX_SUBS MAXCHANNELS
 static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
@@ -130,18 +132,11 @@ struct ncclProxySharedP2p {
   int size;
   char* cudaBuff;
   char* hostBuff;
-  cudaIpcMemHandle_t ipc;
+  // CUDA IPC
+  ncclIpcDesc ipcDesc;
   struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
 };
 
-struct ncclProxySharedCollNet {
-  int size;
-  char* cudaBuff;
-  char* hostBuff;
-  struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
-  void* resources;
-};
-
 struct ncclProxyPeer {
   struct ncclProxySharedP2p send;
   struct ncclProxySharedP2p recv;
@@ -165,7 +160,6 @@ struct ncclProxyProgressState {
   bool stop;
   struct ncclProxyPeer** localPeers;
   struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
-  struct ncclProxySharedCollNet collNet;
   struct ncclProxyArgs* active;
   struct ncclProxyArgs* pool;
   struct ncclProxyPool* pools;
@@ -192,12 +186,27 @@ struct ncclProxyAsyncOp {
 
 struct ncclProxyLocalPeer {
   struct ncclSocket sock;
-  int localRank;
+  int tpRank;
+  int tpLocalRank;
   ncclProxyAsyncOp* asyncOps;
   int asyncOpCounter;
 };
 
 struct ncclProxyState {
+  int refCount;
+  int tpRank;
+  int tpnRanks;
+  int tpLocalnRanks;
+  int cudaDev;
+  int p2pnChannels;
+  int p2pChunkSize;
+  int nChannels;
+  int buffSizes[NCCL_NUM_PROTOCOLS];
+  bool allocP2pNetLLBuffers;
+  bool dmaBufSupport;
+  ncclNet_t* ncclNet;
+  ncclCollNet_t* ncclCollNet;
+  volatile uint32_t* abortFlag;
   // Service thread
   pthread_t thread;
   struct ncclSocket* listenSock;
@@ -209,6 +218,7 @@ struct ncclProxyState {
   struct ncclSocket* peerSocks;
   struct ncclProxyOps* proxyOps;
   void** sharedDevMems;
+  struct ncclIpcSocket peerIpcSock; // cuMem API support (UDS)
 
   // Progress thread
   struct ncclProxyProgressState progressState;
@@ -228,13 +238,14 @@ enum proxyConnectState {
 
 struct ncclProxyConnection {
   int send, transport, shared;
-  int localRank;
+  int tpLocalRank, sameProcess;
   struct ncclSocket* sock;
   struct ncclTransportComm* tcomm;
   struct ncclProxyArgs *proxyAppend;
   struct ncclProxyArgs **proxyAppendPtr;
   void* transportResources;
   proxyConnectState state;
+  struct ncclCollNetSharedRes* collNet;
 };
 
 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -250,7 +261,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* prox
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
-ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
 enum ncclProxyMsgType {
   ncclProxyMsgInit = 1,
   ncclProxyMsgSharedInit = 2,
@@ -260,22 +271,24 @@ enum ncclProxyMsgType {
   ncclProxyMsgClose = 6,
   ncclProxyMsgAbort = 7,
   ncclProxyMsgStop = 8,
-  ncclProxyMsgConvertFd = 9 // cuMem API support
+  ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
 };
 
 // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
 // Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
 // ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
-ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
+ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
 
 // This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
-ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
-ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
+ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
+ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
 
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
+ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
+
+ncclResult_t ncclProxyStop(struct ncclComm* comm);
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
 
-enum { proxyRecv=0, proxySend=1 };
-ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
+ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
 
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 01812af1a3..f3f47065a2 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -36,7 +36,6 @@ struct ncclComm;
 struct ncclPeerInfo {
   int rank;
   int cudaDev;
-  int netDev;
   int gdrSupport;
   bool hasFineGrain;
   uint64_t hostHash;
@@ -45,7 +44,6 @@ struct ncclPeerInfo {
   int64_t busId;
   struct ncclComm* comm;
   int cudaCompCap;
-  int virtualId;
 };
 
 #define CONNECT_SIZE 128
@@ -53,15 +51,46 @@ struct ncclConnect {
   char data[CONNECT_SIZE];
 };
 
+#if CUDART_VERSION >= 12010
+
+#define NVLS_HANDLE_SIZE 64
+struct ncclNvlsSharedRes {
+  int refCount;
+  CUmulticastObjectProp properties;
+  CUmemAccessDesc accessDesc;
+  int dev;
+  size_t size;
+  size_t granularity;
+  CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
+  char* mcBuff; // Multicast NVLS buffer address
+  CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
+  char* ucBuff; // Unicast NVLS buffer address
+  char shareableHandle[NVLS_HANDLE_SIZE];
+  int nChannels;
+};
+
+#endif /* CUDART_VERSION >= 12010 */
+
+struct ncclCollNetSharedRes {
+  int refCount;
+  int size;
+  char* cudaBuff;
+  char* hostBuff;
+  struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
+  void* resources;
+  int nChannels;
+  size_t buffSize;
+};
+
 struct ncclTransportComm {
   ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
   ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
   ncclResult_t (*free)(struct ncclConnector*);
-  ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
-  ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
-  ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
-  ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
-  ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
+  ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels);
+  ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+  ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+  ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
+  ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
 };
 
 struct ncclTransport {
@@ -74,10 +103,9 @@ struct ncclTransport {
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
 
-#if CUDART_VERSION >= 12010
-ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
+ncclResult_t ncclNvlsInit(struct ncclComm* comm);
+ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
-#endif
 
 enum { collNetRecv=0, collNetSend=1 };
 int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
diff --git a/src/init.cc b/src/init.cc
index 9b035f3faa..9778cb7bda 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -53,7 +53,7 @@
 #endif
 
 const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAllPivot" };
-const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS" };
+const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 const char* ncclDevRedOpStr[ncclNumDevRedOps] = { "Sum", "Prod", "Max", "Min", "PreMulSum", "SumPostDiv" };
 const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "_u64", "_f16", "_f32", "_f64", "_b16"};
@@ -64,6 +64,7 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
 NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT);
 
 struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
+static ncclResult_t commReclaim(ncclComm_t comm);
 
 static uint64_t hashUniqueId(ncclUniqueId const &id) {
   char const *bytes = (char const*)&id;
@@ -284,7 +285,7 @@ void ncclCommPushFree(struct ncclComm* comm, void* obj) {
 }
 
 static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) {
-  CUDACHECK(cudaFree(dtor->obj));
+  NCCLCHECK(ncclCudaFree(dtor->obj));
   return ncclSuccess;
 }
 void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) {
@@ -327,8 +328,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
   /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will
    * free all intra-process communicators; therefore, we only need to focus on local
    * resource cleanup in commFree(). */
-  if (comm->proxyState.thread)
-    pthread_join(comm->proxyState.thread, nullptr);
+  if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
+    pthread_join(comm->proxyState->thread, nullptr);
+  }
 
   delete[] comm->userRedOps;
 
@@ -368,19 +370,29 @@ static ncclResult_t commFree(ncclComm_t comm) {
   }
   free(comm->rankToNode);
   free(comm->rankToLocalRank);
+  free(comm->collNetHeads);
 
   if (comm->bootstrap)
     NCCLCHECK(bootstrapClose(comm->bootstrap));
 
   for (int channel=0; channel<MAXCHANNELS; channel++)
-    NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
+    NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks, 1, comm->localRanks));
 
   if (comm->doneEvent != NULL)
     CUDACHECK(hipEventDestroy(comm->doneEvent));
 
-  if (comm->initState == ncclSuccess) {
-    NCCLCHECK(ncclStrongStreamDestruct(&comm->hostStream));
-    NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
+  if (comm->sharedRes) {
+    if (ncclAtomicRefCountDecrement(&comm->sharedRes->refCount) == 0) {
+      for (int c=0; c<MAXCHANNELS; c++) {
+        if (comm->sharedRes->peers[c]) free(comm->sharedRes->peers[c]);
+        if (comm->sharedRes->devPeers[c]) ncclCudaFree(comm->sharedRes->devPeers[c]);
+      }
+      free(comm->sharedRes->tpRankToLocalRank);
+      NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream));
+      NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream));
+      NCCLCHECK(ncclProxyDestroy(comm));
+      free(comm->sharedRes);
+    }
   }
 
 #if CUDART_VERSION >= 12010
@@ -397,8 +409,14 @@ static ncclResult_t commFree(ncclComm_t comm) {
   ncclMemoryStackDestruct(&comm->memScoped);
   ncclMemoryStackDestruct(&comm->memPermanent);
 
-  ncclCudaHostFree((void *)comm->abortFlag);
-  free(comm->netName);
+  if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) {
+    NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
+    free(comm->abortFlagRefCount);
+  }
+  free((void*)comm->config.netName);
+
+  free(comm->topParentRanks);
+  free(comm->topParentLocalRanks);
 
   commPoison(comm); // poison comm before free to avoid comm reuse.
   free(comm);
@@ -460,7 +478,7 @@ exit:
   return ret;
 }
 
-static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtualId) {
+static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
   if (ndev < 1) {
     WARN("invalid device count (%d) requested", ndev);
     return ncclInvalidArgument;
@@ -470,20 +488,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
     return ncclInvalidArgument;
   }
 
-  struct ncclComm* comm;
-  /* Cuurently we calloc comm in ncclCommInitRankDev for async function support.
-   * This 'if' structure is designed to consider the case where commAlloc is called
-   * in other cases except ncclCommInitRankDev. */
-  if (*comret == NULL) {
-    /* user requests a new communicator */
-    NCCLCHECK(ncclCalloc(&comm, 1));
-    NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
-    NCCLCHECK(ncclCommSetAsyncError(comm, ncclInProgress));
-  } else {
-    /* We already allocated a communicator in ncclCommInitRankDev. */
-    comm = *comret;
-  }
-
   ncclMemoryStackConstruct(&comm->memPermanent);
   ncclMemoryStackConstruct(&comm->memScoped);
   comm->destructorHead = nullptr;
@@ -491,8 +495,14 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
   comm->nRanks = ndev;
 
   NCCLCHECK(ncclNetInit(comm));
-  INFO(NCCL_INIT, "Using network %s", ncclNetName(comm));
+  INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
 
+  if (parent && parent->config.splitShare) {
+    if (parent->ncclNet != comm->ncclNet) {
+      WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
+      return ncclInvalidUsage;
+    }
+  }
   // Try to create a CUDA object right away. If there is something wrong with
   // the device we're on (failure cause #1) , better know it early.
   hipEvent_t doneEvent;
@@ -502,13 +512,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
   CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
 #endif
 
-  NCCLCHECK(ncclStrongStreamConstruct(&comm->deviceStream));
-  NCCLCHECK(ncclStrongStreamConstruct(&comm->hostStream));
 
   comm->doneEvent = doneEvent;
   comm->lastStream = nullptr;
-  comm->virtualId = virtualId;
-  cudaGetDevice(&comm->cudaDev);
+  CUDACHECK(cudaGetDevice(&comm->cudaDev));
+
   NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
   comm->compCap = ncclCudaCompCap();
   TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
@@ -529,6 +537,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
     comm->collTraceThread = 0;
 #endif
   comm->collNetSupport = 0;
+  memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
 
   ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
   ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
@@ -546,10 +555,30 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
   // Mark channels as non initialized.
   for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1;
 
-  ncclIntruQueueMpscConstruct(&comm->callbackQueue);
+  if (parent == NULL || !parent->config.splitShare) {
+    struct ncclSharedResources* sharedRes = NULL;
+    NCCLCHECK(ncclCalloc(&sharedRes, 1));
+    /* most attributes are assigned later in initTransportsRank(). */
+    sharedRes->owner = comm;
+    sharedRes->tpNRanks = comm->nRanks;
+    NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks));
+    NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream));
+    NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream));
+    comm->sharedRes = sharedRes;
+    sharedRes->refCount = 1;
+  } else {
+    comm->sharedRes = parent->sharedRes;
+    ncclAtomicRefCountIncrement(&parent->sharedRes->refCount);
+  }
 
   CUDACHECK(hipDeviceGetAttribute(&comm->WarpSize, hipDeviceAttributeWarpSize, comm->cudaDev));
-  *comret = comm;
+  if (comm->topParentRanks == NULL) {
+    NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks));
+    for (int i = 0; i < comm->nRanks; ++i)
+      comm->topParentRanks[i] = i;
+  }
+
+  ncclIntruQueueMpscConstruct(&comm->callbackQueue);
   return ncclSuccess;
 }
 
@@ -559,8 +588,8 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   struct ncclDevCommAndChannels tmpCommAndChans;
   struct ncclDevCommAndChannels *devCommAndChans = NULL;
 
-  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->deviceStream), ret, fail);
-  NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->deviceStream.cudaStream), ret, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
+  NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
   ncclCommPushCudaFree(comm, devCommAndChans);
   comm->devComm = &devCommAndChans->comm;
   tmpCommAndChans.comm.rank = comm->rank;
@@ -608,7 +637,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
     tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
 
     if (comm->channels[c].ring.userRanks != nullptr) {
-      NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->deviceStream.cudaStream), ret, fail);
+      NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
     }
   }
 
@@ -633,10 +662,10 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
     NCCLCHECK(mscclInit(comm));
   }
 
-  NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->deviceStream.cudaStream), ret, fail);
+  NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
 exit:
-  CUDACHECK(cudaStreamSynchronize(comm->deviceStream.cudaStream));
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
+  CUDACHECK(cudaStreamSynchronize(comm->sharedRes->deviceStream.cudaStream));
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream));
   return ret;
 fail:
   goto exit;
@@ -661,7 +690,6 @@ static void showVersion() {
 
 static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
   info->rank = comm->rank;
-  info->virtualId = comm->virtualId;
   CUDACHECK(cudaGetDevice(&info->cudaDev));
   info->hostHash=getHostHash()+commHash;
   info->pidHash=getPidHash()+commHash;
@@ -689,7 +717,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
   comm->hasFineGrain = info->hasFineGrain;
 
   info->comm = comm;
-  info->cudaCompCap = ncclCudaCompCap();
+  info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap;
   return ncclSuccess;
 }
 
@@ -739,6 +767,13 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
   if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
   else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
   else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
+  if (comm->sharedRes->owner != comm) {
+    /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */
+    comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize);
+  } else {
+    comm->sharedRes->tpP2pChunkSize = comm->p2pChunkSize;
+  }
+
   INFO(NCCL_INIT, "P2P Chunksize set to %d", comm->p2pChunkSize);
   return ncclSuccess;
 }
@@ -748,7 +783,7 @@ NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
 NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0);
 NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
 
-static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collNetGraph) {
+static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* collNetGraph) {
   ncclResult_t ret = ncclSuccess;
   int* heads = NULL;
   int rank = comm->rank;
@@ -758,6 +793,13 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
   int nHeads = collNetGraph->nChannels;
   int highestTransportType0, highestTransportType1;
   char line[1024];
+  bool share;
+
+  struct collnetShareInfo {
+    int headPosition;
+    int isMaster;
+  };
+  struct collnetShareInfo* infos = NULL;
 
   NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
   // Head GPU index is always 0
@@ -765,18 +807,124 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
     heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
   }
 
-  for (int c = 0; c < comm->nChannels; c++) {
-    struct ncclChannel* channel = comm->channels + c;
-    for (int h = 0; h < nHeads; h++) {
-      const int head = heads[h];
-      collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
-      if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
+  comm->collNetHeads = heads;
+  comm->collNetHeadsNum = nHeads;
+  if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) {
+    NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail);
+    /* check whether child can share collnet resources of parent. Since parent builds each collnet communicator
+     * based on heads with the same head position in each node, as long as the collnet heads of child comm
+     * can match parent's heads, we can let child communicator share parent's collnet resources. */
+    for (int h = 0; h < nHeads; ++h) {
+      int prev = INT_MIN;
+      struct collnetShareInfo* myinfo;
+
+      share = true;
+      myinfo = infos + comm->rank;
+      memset(myinfo, 0, sizeof(struct collnetShareInfo));
+      /* find the child head position in parent collnet heads. */
+      if (heads[h] == comm->rank) {
+        myinfo->headPosition = -1;
+        myinfo->isMaster = 1;
+        for (int th = 0; th < parent->collNetHeadsNum; ++th)
+          if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) {
+            myinfo->headPosition = th;
+            break;
+          }
+      }
+
+      NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail);
+      for (int i = 0; i < comm->nRanks; ++i) {
+        if (infos[i].isMaster) {
+          if (prev == INT_MIN)
+            prev = infos[i].headPosition;
+
+          if (infos[i].headPosition == -1 || prev != infos[i].headPosition) {
+            share = false;
+            break;
+          }
+        }
+      }
+
+      if (share) {
+        if (myinfo->isMaster) {
+          comm->collNetSharedRes = parent->collNetSharedRes;
+          comm->collNetChannels = std::min(std::max(comm->nChannels, comm->nvlsChannels), parent->collNetSharedRes->nChannels);
+          for (int c = 0; c < comm->collNetChannels; ++c)
+            NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail);
+        }
+      } else {
+        /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot
+         * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be
+         * lifted by sharp plugin/IB hardware in the future. */
+        collNetSetupFail = 1;
+        if (comm->rank == 0) {
+          WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks);
+        }
+        goto fail;
+      }
     }
-    // Verify CollNet setup across ranks after trying the first channel
-    if (c == 0) {
-      NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
+    share = true;
+  } else {
+    /* this allocated buffer will be freed on proxy side */
+    NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
+    /* TODO: min or max? */
+    comm->collNetChannels = comm->collNetSharedRes->nChannels = std::max(comm->nChannels, comm->nvlsChannels);
+    comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
+    for (int c = 0; c < comm->collNetChannels; c++) {
+      struct ncclChannel* channel = comm->channels + c;
+      NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail);
+      for (int h = 0; h < nHeads; h++) {
+        const int head = heads[h];
+        collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
+        if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
+      }
+      // Verify CollNet setup across ranks after trying the first channel
+      if (c == 0) {
+        NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
+      }
     }
+    share = false;
   }
+
+  if (share) {
+    memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix));
+  } else {
+    do {
+      /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some
+      ranks don't connect to sharp we enable a (redop,type) if any rank claims
+      support. */
+      const ncclRedOp_t redops[] = {ncclSum, ncclProd, ncclMin, ncclMax};
+      uint8_t(*matrix)[4][ncclNumTypes];
+      bool isHead = false;
+      matrix = nullptr;
+      NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end);
+      for (int h = 0; h < nHeads; h++) isHead |= (heads[h] == comm->rank);
+      if (isHead) {
+        for (int ty=0; ty < ncclNumTypes; ty++) {
+          for (int i=0; i < 4; i++) {
+            int support = 0;
+            NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, redops[i], &support), ret, matrix_end);
+            // bit 0 = not supported, bit 1 = supported
+            matrix[rank][redops[i]][ty] = 1<<(support ? 1 : 0);
+          }
+        }
+      }
+      NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end);
+      for (int ty=0; ty < ncclNumTypes; ty++) {
+        for (int i=0; i < 4; i++) {
+          int op = redops[i];
+          uint8_t accum = 0;
+          for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty];
+          // We support (redop, type) if some rank supports it and no rank doesn't support it
+          comm->collNetSupportMatrix[op][ty] = (accum == (1<<1));
+        }
+      }
+    matrix_end:
+      free(matrix);
+      if (ret != ncclSuccess) goto fail;
+    } while (0);
+  }
+
   // Verify CollNet setup across ranks after trying all channels
   NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
   TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
@@ -819,6 +967,9 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
   // Exchange highest intra-node transport type among ranks
   // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
   comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
+  if (share) {
+    comm->intraHighestTransportType = std::max(comm->intraHighestTransportType, parent->intraHighestTransportType);
+  }
   NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
   for (int i = 0; i < comm->localRanks; i++) {
     if (highestTypes[i] > comm->intraHighestTransportType)
@@ -828,7 +979,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
   INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
 
 exit:
-  free(heads);
+  free(infos);
   return ret;
 fail:
   ncclTransportCollNetFree(comm);
@@ -836,18 +987,19 @@ fail:
   goto exit;
 }
 
-static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {
   // We use 2 AllGathers
   // 1. { peerInfo, comm, compCap}
   // 2. { nChannels, graphInfo, topoRanks }
   ncclResult_t ret = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
   cpu_set_t affinitySave;
   struct ncclTopoGraph ringGraph;
   struct ncclTopoGraph treeGraph;
   struct ncclTopoGraph collNetGraph;
+  struct ncclTopoGraph nvlsGraph;
+  struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph };
 
   struct graphInfo {
     int pattern;
@@ -860,11 +1012,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   };
 
   struct allGatherInfo {
-    int netDev;
-    int collNetSupport;
-    struct graphInfo tree;
-    struct graphInfo ring;
-    struct graphInfo collNet;
+    struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
     struct ncclTopoRanks topoRanks;
     int nc;
     bool pivotA2AEnabled;
@@ -880,38 +1028,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   int* nvbPeers = NULL;
   struct ncclProxyConnector proxyConn;
   int* pxnPeers = NULL;
-
-  TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)commId, comm), ret, fail);
+  int *topParentLocalRanks = NULL;
+  int tpProxyRank;
 
   // AllGather1 - begin
   NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
-  NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, commHash), ret, fail);
+  NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
   NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
 
-  //If virtualId == -1 multiRank support has not been requested by user, using original interface
-  if (comm->virtualId == -1) {
-    for (int i = 0; i < nranks; i++) {
-      if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
-        WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
-        ret = ncclInvalidUsage;
-        goto fail;
-      }
-    }
-  }
-  else {
-    //Multiple ranks can use the same device, but need to have different virtualId's.
-    for (int i = 0; i < nranks; i++) {
-      for (int j=0; j < nranks; j++) {
-      	if (j==i) continue;
-      	if((comm->peerInfo[i].hostHash  == comm->peerInfo[j].hostHash)  &&
-      	   (comm->peerInfo[i].busId     == comm->peerInfo[j].busId)     &&
-      	   (comm->peerInfo[i].virtualId == comm->peerInfo[j].virtualId)) {
-      	  WARN("Duplicate virtualId detected : rank %d and rank %d both on GPU device %lx virtualId %d",
-      	       i, j, comm->peerInfo[rank].busId, comm->peerInfo[i].virtualId);
-      	  return ncclInvalidUsage;
-      	}
-      }
+  for (int i = 0; i < nranks; i++) {
+    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+      ret = ncclInvalidUsage;
+      goto fail;
     }
   }
   // AllGather1 - end
@@ -919,6 +1048,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   do {
     // Compute intra-process ranks
     int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+    for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap); // min over all peers
+    for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap); // max over all peers
     for (int i = 0; i < nranks; i++) {
       if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
           && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
@@ -983,8 +1114,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
   }
 
-  // Launch proxy service thread
-  NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+  // Determine local CollNet support
+  if (collNetSupport(comm)) {
+    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
+    if (collNetEnable != NULL) {
+      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
+      if (strcmp(collNetEnable, "1") == 0) {
+        comm->collNetSupport = 1;
+      }
+    }
+  }
+
+  // Determine local Nvls support
+  NCCLCHECKGOTO(ncclNvlsInit(comm), ret, fail); // route errors through the fail path like the neighbouring calls
 
   // Get rings and trees
   ringGraph.id = 0;
@@ -1007,8 +1149,24 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
   collNetGraph.collNet = 1;
   collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
-  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
-  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
+  if (comm->collNetSupport) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
+  } else {
+    collNetGraph.nChannels = 0;
+  }
+
+  nvlsGraph.id = 3;
+  nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
+  nvlsGraph.collNet = 0;
+  nvlsGraph.minChannels = 1;
+  nvlsGraph.maxChannels = MAXCHANNELS;
+  if (comm->nvlsSupport) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
+  } else {
+    nvlsGraph.nChannels = 0;
+  }
 
   bool allXgmi, hasPeerAccess;
   allXgmi = true;
@@ -1036,22 +1194,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
 
   if (comm->rank == ncclParamGraphDumpFileRank()) {
-    struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
-    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 3, graphs), ret, fail);
+    struct ncclTopoGraph* dumpGraphs[4] = { &ringGraph, &treeGraph, &collNetGraph, &nvlsGraph };  // NOTE(review): presumably renamed to avoid clashing with the graphs[] used further down - confirm
+    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 4, dumpGraphs), ret, fail);  // the count must match the array length above
   }
 
-  // Determine local CollNet support before all-gather
-  if (collNetSupport(comm)) {
-    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
-    if (collNetEnable != NULL) {
-      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
-      if (strcmp(collNetEnable, "1") == 0) {
-        comm->collNetSupport = 1;
-      }
-    }
-  }
-  if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0;
-
   if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
     if (rcclParamP2pNetDisable() == 0) {
       if (!(comm->topo->type & RCCL_TOPO_FORCE_INTRA)) comm->p2pNet = 1;
@@ -1065,55 +1211,38 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   int idx;
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
   allGather3Data[rank].nc = 2;
-  if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1)  ||
-  (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&  // number of GPU nodes in the topology equals the rank count
        comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
     allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
     allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
-  if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1)  ||
-  (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&  // same GPU-count test as above
        (comm->topo->type & RCCL_TOPO_CR8G))
     allGather3Data[rank].nc = 4;
-  if (((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1)  ||
-       (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&  // same GPU-count test as above
       comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
     allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
     allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
   if (ringGraph.nChannels > MAXCHANNELS/2)
     allGather3Data[rank].nc = 1;
-  NCCLCHECKGOTO(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev), ret, fail);
-  allGather3Data[rank].tree.pattern = treeGraph.pattern;
-  allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
-  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
-  allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra;
-  allGather3Data[rank].tree.bwInter = treeGraph.bwInter;
-  allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
-  allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
-  allGather3Data[rank].ring.pattern = ringGraph.pattern;
-  allGather3Data[rank].ring.nChannels = ringGraph.nChannels;
-  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
-  allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra;
-  allGather3Data[rank].ring.bwInter = ringGraph.bwInter;
-  allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
-  allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
-  allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
-  allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels;
-  allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
-  allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra;
-  allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter;
-  allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
-  allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
-  allGather3Data[rank].collNetSupport = comm->collNetSupport;
   allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
   comm->topo->ll128Enabled =  comm->topo->ll128Enabled || rcclParamLL128ForceEnable();
   allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled;
   allGather3Data[rank].mscclEnabled = comm->topo->mscclEnabled;
 
-  comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
-    ? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
-  NCCLCHECKGOTO(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks), ret, fail);
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {  // copy this rank's graph summary for every algorithm into its allgather slot
+    allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
+    allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
+    allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
+    allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
+    allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
+    allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
+    allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
+  }
+
+  comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);  // bounded by both the tree and the ring graph
+  NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);  // graphs[] passes all per-algorithm graphs at once
 
   NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
 
@@ -1129,7 +1258,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       comm->nNodes++;
       nodesFirstRank[node] = firstRank;
       // Record tree pattern of each node as they can be different depending on sm arch
-      nodesTreePatterns[node] = allGather3Data[r].tree.pattern;
+      nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;  // graphInfo[] is indexed by algorithm id
     }
     comm->rankToNode[r] = node;
   }
@@ -1172,32 +1301,22 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   int nc;
   nc = allGather3Data[0].nc;
   for (int i=0; i<nranks; i++) {
-    comm->peerInfo[i].netDev = allGather3Data[i].netDev;
     allTopoRanks[i] = &allGather3Data[i].topoRanks;
     nc = std::min(allGather3Data[i].nc, nc);
     // Make sure we align all ranks so that the tuning is consistent across ranks
-    treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
-    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
-    treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra);
-    treeGraph.bwInter = std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter);
-    treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
-    treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
-    ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
-    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
-    ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra);
-    ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter);
-    ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
-    ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
-    collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
-    collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
-    collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra);
-    collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter);
-    collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
-    collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
-    comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
     comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled;
     comm->topo->ll128Enabled = comm->topo->ll128Enabled && allGather3Data[i].ll128Enabled;
     comm->topo->mscclEnabled = comm->topo->mscclEnabled && allGather3Data[i].mscclEnabled;
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {  // reduce each algorithm's graph across ranks: min for channels/bandwidth, max for link types
+      graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
+      graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
+      graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
+      graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
+      graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
+      graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
+    }
+    if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;  // one rank reporting zero channels disables CollNet for all
+    if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = 0;  // likewise for NVLS
   }
 
   comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
@@ -1226,8 +1345,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   }
 
   NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc), ret, fail);
 
+  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, nc), ret, fail);  // receives the whole graphs[] array rather than a single graph
   if (comm->topo->pivotA2ANumBiRings == 3) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
 
   // AllGather3 - end
@@ -1248,6 +1367,29 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
 
+  // Compute nChannels per peer for p2p
+  NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+
+  /* until now, all info of comm should be known. We can initialize shared resources and
+   * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
+   * all proxy operations. */
+  if (comm->sharedRes->owner == comm) {
+    comm->sharedRes->tpNLocalRanks = comm->localRanks;
+    comm->sharedRes->magic = comm->magic;
+    comm->sharedRes->tpNChannels = comm->nChannels;
+    comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
+    memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
+  }
+  NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
+  for (int i = 0; i < comm->localRanks; ++i) {
+    int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
+    topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank];
+  }
+  comm->topParentLocalRanks = topParentLocalRanks;
+
+  // Launch proxy service thread, after this, the proxy calls can be used.
+  NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+
   // Connect with prev/next for each ring
   for (int c=0; c<comm->nChannels; c++) {
     struct ncclChannel* channel = comm->channels+c;
@@ -1278,42 +1420,44 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail);
   INFO(NCCL_INIT, "Connected all trees");
 
-  // Check if we can setup CollNet
-  if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
+  // Setup NVLS
+  NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+  // And NVLS trees if needed
+  if (comm->nvlsSupport && comm->localRanks > 1) {  // only when NVLS is supported and more than one rank is local
+    for (int c=0; c<comm->nvlsChannels; c++) {
+      struct ncclChannel* channel = comm->channels+c;
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail);
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail);
+    }
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail);
+    INFO(NCCL_INIT, "Connected NVLS tree");
+  }
 
 #if CUDART_VERSION >= 12010
-  NCCLCHECKGOTO(ncclNvlsSetup(comm), ret, fail);
+  // Check if we can setup CollNet. NOTE(review): this now sits inside the CUDART_VERSION guard that used to wrap ncclNvlsSetup - confirm that is intended.
+  if (comm->collNetSupport > 0) collNetTrySetup(comm, parent, &collNetGraph);
 #endif
 
   TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
 
   // Compute time models for algorithm and protocol combinations
-  do {
-    int myCompCap = comm->peerInfo[rank].cudaCompCap;
-    int minCompCap = myCompCap, maxCompCap = myCompCap;
-    for (int i = 0; i < nranks; i++) {
-      comm->minCompCap = minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
-      maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
-    }
-    NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
-  } while(0);
-
-  // Compute nChannels per peer for p2p
-  NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+  NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);  // uses the compute-capability bounds stored in comm
 
   INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
 
   do { // Setup p2p structures in comm->tasks
     struct ncclTasks* tasks = &comm->tasks;
-    int nRanks = comm->nRanks;
     int node = comm->node;
     int nNodes = comm->nNodes;
     struct ncclNodeRanks *nodeRanks = comm->nodeRanks;
     int localRank = comm->localRank;
-    tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, nRanks);
-    tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
-    tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
-    int s=0, r=0;
+    // We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8.
+    int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2);
+    tasks->p2pOrderSteps = comm->nNodes * steps;  // one slot per (node, step) pair
+    tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, tasks->p2pOrderSteps);
+    tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, tasks->p2pOrderSteps);
+    tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, tasks->p2pOrderSteps);
+    int i=0;  // next slot to fill, shared by p2pSendOrder and p2pRecvOrder
     // schedule delta 0, +1, -1, +2, -2, ...
     // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
     for (int d=0; d <= nNodes/4; d++) {
@@ -1323,18 +1467,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     sched_delta:
       int recvNode = (node+nNodes-delta)%nNodes;
       int sendNode = (node+delta)%nNodes;
-      int steps = comm->maxLocalRanks;
       for (int step=0; step < steps; step++) {
         int recvIndex = (localRank-step+steps)%steps;
-        if (recvIndex < nodeRanks[recvNode].localRanks) {
-          tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex];
-          r++;
-        }
+	int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;  // -1: no such local rank on recvNode
+        tasks->p2pRecvOrder[i] = recvRank;
         int sendIndex = (localRank+step)%steps;
-        if (sendIndex < nodeRanks[sendNode].localRanks) {
-          tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex];
-          s++;
-        }
+        int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;  // -1: no such local rank on sendNode
+        tasks->p2pSendOrder[i] = sendRank;
+        i++;  // always advance, so send and recv entries of a step share one index
       }
       index++;
       if (index == 1 && deltas[1] == deltas[0]) index++;
@@ -1346,7 +1486,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
         goto sched_delta;
       }
     }
-    assert(s == nRanks && r == nRanks);
+    assert(i == tasks->p2pOrderSteps);  // every slot of the send/recv order arrays must have been written
   } while (0);
 
   if (ncclParamNvbPreconnect()) {
@@ -1358,13 +1498,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       int channelId;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail);
-        if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
+        if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {  // peers[] entries are pointers here, hence ->
           comm->connectSend[peer] |= (1UL<<channelId);
         }
       }
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail);
-        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
+        if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {  // peers[] entries are pointers here, hence ->
           comm->connectRecv[peer] |= (1UL<<channelId);
         }
       }
@@ -1374,16 +1514,18 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   }
 
   // Connect to local net proxy
-  NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
-  NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+  tpProxyRank = comm->topParentRanks[comm->rank];  // the proxy is addressed by this rank's top-parent rank
+  NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
+  NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
 
   // Then to remote ones when using PXN
   if (ncclPxnDisable(comm) == 0) {
     int nranks;
     NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
     for (int r=0; r<nranks; r++) {
-      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
-      NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+      tpProxyRank = comm->topParentRanks[pxnPeers[r]];  // same translation for each PXN peer
+      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
+      NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
     }
   }
 
@@ -1416,8 +1558,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
 exit:
   if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  // Unlink proxy shm to make sure it will be properly cleaned up.
-  ncclProxyShmUnlink(comm);
+  /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
+   * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+   * properly cleaned up. */
+  if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm);
   free(allTopoRanks);
   free(nodesTreePatterns);
   free(nodesFirstRank);
@@ -1445,11 +1589,15 @@ NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
 
 struct ncclCommInitRankAsyncJob {
   struct ncclAsyncJob base;
-  ncclComm_t* newcomm;
+  struct ncclComm* comm;
+  struct ncclComm** newcomm;
+  int cudaDev;
+  // For ncclCommInitRank
   int nranks, myrank;
   ncclUniqueId commId;
-  int cudaDev;
-  int virtualId;
+  // for ncclCommSplit
+  struct ncclComm* parent;
+  int color, key;
 };
 
 struct ncclCommFinalizeAsyncJob {
@@ -1457,26 +1605,71 @@ struct ncclCommFinalizeAsyncJob {
   ncclComm_t comm;
 };
 
+NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
+
+static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) {  // comm is not used in the body
+  int* colors = NULL;  // color of every parent rank, indexed by parent rank
+  int* keys = NULL;  // key of every parent rank, indexed by parent rank
+  int nRanks = 0, myRank = 0;
+  ncclResult_t ret = ncclSuccess;
+
+  NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail);
+
+  // Compute nRanks, my rank and the ranks (of the original comm) before and after me
+  colors[parent->rank] = color;
+  keys[parent->rank] = key;
+  NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail);  // one int per parent rank
+  NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail);
+
+  // Negative color does not create a new comm. Return now.
+  if (color == NCCL_SPLIT_NOCOLOR) goto exit;  // none of the *Ret outputs are written on this path
+
+  memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks);  // fill every byte with 0xff
+  for (int i = 0; i < parent->nRanks; i++) {
+    if (colors[i] != color) continue;
+    // Find where to insert this rank
+    int insert = 0;
+    while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++;  // "<=" keeps equal keys in ascending parent-rank order
+    // Shift ranks by one after insert
+    for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1];
+    // Insert our rank
+    parentRanksRet[insert] = i;
+    nRanks++;
+  }
+
+  for (int i = 0; i < nRanks; i++) {  // find our own position in the key-sorted list
+    if (parentRanksRet[i] == parent->rank) myRank = i;
+  }
+
+  *nRanksRet = nRanks;
+  *myRankRet = myRank;
+
+exit:
+  free(colors);
+  free(keys);
+  return ret;
+fail:
+  goto exit;
+}
+
 static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
-  ncclComm_t* newcomm = job->newcomm;
-  ncclComm_t comm = *newcomm;
-  int nranks = job->nranks;
-  ncclUniqueId commId = job->commId; // C++ struct assignment
-  int myrank = job->myrank;
-  int cudaDev = job->cudaDev;
-  int virtualId = job->virtualId;
+  ncclComm_t comm = job->comm;  // communicator object supplied with the job
+  ncclResult_t res = ncclSuccess;
   int archMajor, archMinor;
   size_t maxLocalSizeBytes = 0;
-  ncclResult_t res = ncclSuccess;
+  int cudaDev = job->cudaDev;
+  int* parentRanks = NULL;  // allocated only on the split path; freed at exit
+  int cudaArch;
   int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
 
   CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
-  CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
-  CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev));
-  comm->cudaArch = 100*archMajor + 10*archMinor;
+  CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail);
+  CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail);
+  cudaArch = 100*archMajor + 10*archMinor;  // kept in a local; copied into comm->cudaArch later in this function
 
-  NCCLCHECK(ncclInitKernelsForDevice(comm->cudaArch, &maxLocalSizeBytes));
+  NCCLCHECKGOTO(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes), res, fail);  // go through fail like the checks above, so comm->initState records the error
   // Set the maximum kernel stack size of all kernels to avoid
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
 #ifdef USE_INDIRECT_FUNCTION_CALL
@@ -1485,18 +1678,49 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize));
   }
 #endif
-  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank, virtualId), res, fail);
-  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, fail);
+
+  if (job->parent) {  // split path: rank layout is derived from the parent communicator
+    NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail);
+    NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
+    // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
+    if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
+    snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);  // child commId text: parent commHash as 16 hex digits, '-', color
+    NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
+    NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
+  } else {  // plain init path: nranks, myrank and commId come from the job as given
+    NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
+    NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
+  }
+
+  comm->cudaArch = cudaArch;
+  comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
+
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
+
+  NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);
 
   // update communicator state
   comm->initState = ncclSuccess;
 
   // Trace this call for replay tool
-  TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)",
-    *newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev);
+  if (job->parent) {
+    /* unlink child abort flag. */
+    __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
+    TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)",
+                job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
+  } else {
+    TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)",
+                comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev);
+  }
 
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx localSize %zi used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, maxLocalSizeBytes, allocTracker[(*newcomm)->cudaDev].totalAllocSize);
+
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx commId 0x%llx localSize %zi used %ld bytes - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, (unsigned long long)hashUniqueId(job->commId), maxLocalSizeBytes, allocTracker[comm->cudaDev].totalAllocSize);
 exit:
+  if (job->newcomm) {
+    /* assign it to user pointer; this runs on normal completion and on the NOCOLOR early exit above. */
+    __atomic_store_n(job->newcomm, comm, __ATOMIC_RELEASE);
+  }
+  free(parentRanks);
   return res;
 fail:
   comm->initState = res;
@@ -1510,14 +1734,88 @@ fail:
     INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \
   }
 
-static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
+static ncclResult_t envConfigOverride(ncclComm_t comm) {  // applies env overrides to comm->config, then sanitizes it; returns ncclSystemError if the netName copy cannot be allocated
   ncclResult_t ret = ncclSuccess;
-  /* config must not be NULL in this function */
+  const char* tmpNetName = comm->config.netName;  // current name; replaced below by NCCL_NET when that is set
+  const char* envNetName;
   int blockingEnv;
   int cgaClusterSizeEnv;
   int minCTAsEnv;
   int maxCTAsEnv;
-  const char *envNetName, *tmpNetName;
+  int splitShareEnv;
+
+  /* override configuration from env variable. */
+  blockingEnv = ncclParamCommBlocking();
+  if (blockingEnv == 0 || blockingEnv == 1)  // any other value leaves config.blocking untouched
+    comm->config.blocking = blockingEnv;
+
+  cgaClusterSizeEnv = ncclParamCGAClusterSize();
+  if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
+    comm->config.cgaClusterSize = cgaClusterSizeEnv;
+  } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
+    WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
+    comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
+  }
+
+  minCTAsEnv = ncclParamMinCTAs();
+  if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
+    comm->config.minCTAs = minCTAsEnv;
+  }
+
+  maxCTAsEnv = ncclParamMaxCTAs();
+  if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
+    comm->config.maxCTAs = maxCTAsEnv;
+  }
+
+  envNetName = getenv("NCCL_NET");
+  if (envNetName)
+    tmpNetName = envNetName;
+  if (tmpNetName != NULL) {
+    char* netNameCopy = strdup(tmpNetName);  // the comm keeps its own heap copy of the name
+    if (netNameCopy == NULL) ret = ncclSystemError;  // report allocation failure instead of copying into NULL
+    comm->config.netName = netNameCopy;
+  } else {
+    comm->config.netName = NULL;
+  }
+
+  splitShareEnv = ncclParamCommSplitShareResources();
+  if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) {
+    comm->config.splitShare = splitShareEnv;
+  }
+
+  /* cap channels if needed */
+  if (comm->config.minCTAs > MAXCHANNELS) {
+    WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS);
+    comm->config.minCTAs = MAXCHANNELS;
+  }
+
+  if (comm->config.maxCTAs > MAXCHANNELS) {
+    WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS);
+    comm->config.maxCTAs = MAXCHANNELS;
+  }
+
+  if (comm->config.minCTAs > comm->config.maxCTAs) {
+    WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs);
+    comm->config.minCTAs = comm->config.maxCTAs;
+  }
+
+  if (comm->config.splitShare != 1 && comm->config.splitShare != 0) {  // the env value is not range-checked above, so validate here
+    WARN("splitShare %d is not a valid value 0/1, set it to 0\n", comm->config.splitShare);
+    comm->config.splitShare = 0;
+  }
+
+  return ret;
+}
+
+static ncclResult_t copyCommConfig(ncclComm_t childComm, ncclComm_t parent) {  // the child starts from a copy of the parent's config
+  memcpy(&childComm->config, &parent->config, sizeof(ncclConfig_t));
+  NCCLCHECK(envConfigOverride(childComm));  // env overrides are applied again; this also gives the child its own netName copy
+  return ncclSuccess;
+}
+
+static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
+  ncclResult_t ret = ncclSuccess;
+  /* config must not be NULL in this function */
   ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
   ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
   ncclConfig_t *internalConfigPtr;
@@ -1570,71 +1868,29 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
     goto fail;
   }
 
+  if (internalConfigPtr->splitShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->splitShare != 0 && internalConfigPtr->splitShare != 1) {
+    WARN("Invalid config splitShare attribute value %d", internalConfigPtr->splitShare);
+    ret = ncclInvalidArgument;
+    goto fail;
+  }
+
   /* default config value can be tuned on different platform. */
   NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d");
   NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d");
   NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d");
   NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
   NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
-
-  tmpNetName = internalConfigPtr->netName;
+  NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d");
 
   /* assign config to communicator */
-  comm->blocking = internalConfigPtr->blocking;
-  comm->cgaClusterSize = internalConfigPtr->cgaClusterSize;
-  comm->minCTAs = internalConfigPtr->minCTAs;
-  comm->maxCTAs = internalConfigPtr->maxCTAs;
+  comm->config.blocking = internalConfigPtr->blocking;
+  comm->config.cgaClusterSize = internalConfigPtr->cgaClusterSize;
+  comm->config.minCTAs = internalConfigPtr->minCTAs;
+  comm->config.maxCTAs = internalConfigPtr->maxCTAs;
+  comm->config.netName = internalConfigPtr->netName;
+  comm->config.splitShare = internalConfigPtr->splitShare;
 
-  /* override configuration from env variable. */
-  blockingEnv = ncclParamCommBlocking();
-  if (blockingEnv == 0 || blockingEnv == 1)
-    comm->blocking = blockingEnv;
-
-  cgaClusterSizeEnv = ncclParamCGAClusterSize();
-  if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
-    comm->cgaClusterSize = cgaClusterSizeEnv;
-  } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
-    WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
-    comm->cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
-  }
-
-  minCTAsEnv = ncclParamMinCTAs();
-  if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
-    comm->minCTAs = minCTAsEnv;
-  }
-
-  maxCTAsEnv = ncclParamMaxCTAs();
-  if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
-    comm->maxCTAs = maxCTAsEnv;
-  }
-
-  /* cap channels if needed */
-  if (comm->minCTAs > MAXCHANNELS) {
-    WARN("minCTAs %d is larger than #channels upper limit %d", comm->minCTAs, MAXCHANNELS);
-    comm->minCTAs = MAXCHANNELS;
-  }
-
-  if (comm->maxCTAs > MAXCHANNELS) {
-    WARN("maxCTAs %d is larger than #channels upper limit %d", comm->maxCTAs, MAXCHANNELS);
-    comm->maxCTAs = MAXCHANNELS;
-  }
-
-  if (comm->minCTAs > comm->maxCTAs) {
-    WARN("minCTAs %d is larger than maxCTAs %d", comm->minCTAs, comm->maxCTAs);
-    ret = ncclInvalidArgument;
-    goto fail;
-  }
-
-  envNetName = getenv("NCCL_NET");
-  if (envNetName)
-    tmpNetName = envNetName;
-  if (tmpNetName != NULL) {
-    int netNameLen = strlen(tmpNetName) + 1;
-    comm->netName = (char*)malloc(netNameLen);
-    memcpy(comm->netName, tmpNetName, netNameLen);
-  } else {
-    comm->netName = NULL;
-  }
+  NCCLCHECKGOTO(envConfigOverride(comm), ret, fail);
 
 exit:
   return ret;
@@ -1642,13 +1898,7 @@ fail:
   goto exit;
 }
 
-static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) {
-  struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
-  ncclCommDestroy(*job->newcomm);
-  *job->newcomm = nullptr;
-}
-
-static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config, int virtualId) {
+static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) {
   ncclResult_t res = ncclSuccess;
   ncclComm_t comm = NULL;
   struct ncclCommInitRankAsyncJob *job = NULL;
@@ -1675,18 +1925,19 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
 
   NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail);
   NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail);
+  NCCLCHECKGOTO(ncclCalloc((uint32_t**)&comm->abortFlagRefCount, 1), res, fail);
+  *comm->abortFlagRefCount = 1;
   NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail);
   /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
   comm->initState = ncclInternalError;
   *newcomm = comm;
 
   NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
-  job->newcomm = newcomm;
+  job->comm = comm;
   job->nranks = nranks;
   job->commId = commId; // C++ struct assignment
   job->myrank = myrank;
   job->cudaDev = cudaDev;
-  job->virtualId = virtualId;
   NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
 
 exit:
@@ -1694,6 +1945,7 @@ exit:
 fail:
   if (comm) {
     if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag);
+    if (comm->abortFlagRefCount) free(comm->abortFlagRefCount);
     free(comm);
   }
   if (newcomm) *newcomm = NULL;
@@ -1724,27 +1976,10 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
   NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
   NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
 
-  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config, -1));
+  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
   return ncclSuccess;
 }
 
-NCCL_API(ncclResult_t, ncclCommInitRankMulti, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int virtualId);
-ncclResult_t ncclCommInitRankMulti(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int virtualId) {
-  // Load the CUDA driver and dlsym hooks (can fail on old drivers)
-  if (ncclParamDmaBufEnable()) rocmLibraryInit();
-
-  int cudaDev;
-  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
-  CUDACHECK(hipGetDevice(&cudaDev));
-
-  NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
-  NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
-
-  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config, virtualId));
-  return ncclSuccess;
-}
-
-
 NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
 ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   ncclResult_t ret = ncclSuccess;
@@ -1794,7 +2029,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
   for (int i=0; i<ndev; i++) {
     // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
-    ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config, -1);
+    ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
   }
   NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
 
@@ -1829,15 +2064,15 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
     internalConfigPtr = &internalConfig;
   else
     internalConfigPtr = config;
-  NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr, -1), ret, fail);
+  NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
 
 exit:
   ncclGroupErrCheck(ret);
   NCCLCHECK(ncclGroupEndInternal());
-  if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
+  if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
   return ret;
 fail:
-  if (newcomm && *newcomm && !(*newcomm)->blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
+  if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
   goto exit;
 }
 
@@ -1856,8 +2091,8 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
   TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
 
   if (comm->initState == ncclSuccess) {
-    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->hostStream), ret, fail);
-    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->deviceStream), ret, fail);
+    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail);
+    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail);
   }
   NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
   // And keep polling until all graphs referencing us die.
@@ -1953,10 +2188,10 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) {
 exit:
   ncclGroupErrCheck(ret);
   NCCLCHECK(ncclGroupEndInternal());
-  if (comm && !comm->blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) };
+  if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) };
   return ret;
 fail:
-  if (comm && !comm->blocking) (void) ncclCommSetAsyncError(comm, ret);
+  if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
   goto exit;
 }
 
@@ -2001,10 +2236,10 @@ static ncclResult_t commReclaim(ncclComm_t comm) {
         }
       }
 
-      /* ncclProxyDestroy() loop must be put after commDestroySync() loop. Namely, you cannot do:
+      /* ncclProxyStop() loop must be put after commDestroySync() loop. Namely, you cannot do:
        *  while(...) {
        *     commDestroySync(...);
-       *     ncclProxyDestroy(...);
+       *     ncclProxyStop(...);
        *  }
        * Considering one process multi-gpu case, we must guarantee all kernels are complete before
        * we free proxy resources; otherwise, we will face invalid memory issues where proxy connection
@@ -2019,7 +2254,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) {
         nextIntraComm = nextIntraComm->intraNext;
 
         /* free intraprocess proxy resources. */
-        if ((ret = ncclProxyDestroy(curIntraComm)) != ncclSuccess) {
+        if ((ret = ncclProxyStop(curIntraComm)) != ncclSuccess) {
           WARN("commReclaim: comm %p (rank = %d) destroys proxy resource error %d", curIntraComm, curRank, ret);
         }
       }
@@ -2080,6 +2315,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
     return ncclSuccess;
   }
 
+  volatile uint32_t* childAbortFlag;
   int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
 
   NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
@@ -2089,6 +2325,10 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
   TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
 
   // Ask anything that might still be running on the device to quit
+  childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE);
+  if (childAbortFlag != NULL) {
+    *childAbortFlag = 1;
+  }
   *comm->abortFlag = 1;
   /* init thread must be joined before we destroy the comm,
    * and we should ignore the init error here. */
@@ -2100,15 +2340,78 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
   return ncclSuccess;
 }
 
+/* Queue the async job (ncclCommInitRankFunc) that splits `comm` by `color`/`key`; NCCL_SPLIT_NOCOLOR allocates no child comm.
+ * NULL `config` inherits the parent's config. On failure *newcomm is NULL and the parent's abort-flag state is rolled back. */
+NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
+ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
+  struct ncclCommInitRankAsyncJob *job = NULL;
+  struct ncclComm* childComm = NCCL_COMM_NULL;
+  ncclResult_t res = ncclSuccess;
+
+  NCCLCHECK(ncclGroupStartInternal());
+  NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail);
+  NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
+
+  /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
+  *newcomm = NCCL_COMM_NULL;
+  if (color == NCCL_SPLIT_NOCOLOR) {
+    INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank);
+  } else {
+    NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail);
+    if (comm->config.splitShare) {
+      childComm->abortFlag = comm->abortFlag;
+      childComm->abortFlagRefCount = comm->abortFlagRefCount;
+      comm->childAbortFlag = NULL;
+      ncclAtomicRefCountIncrement(comm->abortFlagRefCount);
+    } else {
+      NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&childComm->abortFlag, 1), res, fail);
+      NCCLCHECKGOTO(ncclCalloc((uint32_t**)&childComm->abortFlagRefCount, 1), res, fail);
+      /* temporarily used to abort everything during child comm init. */
+      comm->childAbortFlag = childComm->abortFlag;
+      *childComm->abortFlagRefCount = 1;
+    }
+    NCCLCHECKGOTO((config == NULL ? copyCommConfig(childComm, comm) : parseCommConfig(childComm, config)), res, fail);
+    /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
+    childComm->initState = ncclInternalError;
+  }
+
+  NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
+  job->comm = childComm;
+  job->newcomm = newcomm;
+  job->parent = comm;
+  job->color = color;
+  job->key = key;
+  job->cudaDev = comm->cudaDev;
+  NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
+
+exit:
+  ncclGroupErrCheck(res);
+  NCCLCHECK(ncclGroupEndInternal());
+  return res;
+fail:
+  if (childComm) {  /* a child only exists once `comm` passed PtrCheck and was dereferenced above */
+    if (comm->config.splitShare) {  /* shared flag stays with the parent: just drop the reference taken above */
+      (void)__atomic_sub_fetch(comm->abortFlagRefCount, 1, __ATOMIC_ACQ_REL);
+    } else {
+      if (comm->childAbortFlag == childComm->abortFlag) comm->childAbortFlag = NULL;  /* no dangling pointer for ncclCommAbort */
+      if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag);
+      if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount);
+    }
+    free(childComm);
+  }
+  if (newcomm) *newcomm = NULL;
+  goto exit;
+}
+
 NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
 const char* ncclGetErrorString(ncclResult_t code) {
   switch (code) {
     case ncclSuccess                : return "no error";
-    case ncclUnhandledCudaError     : return "unhandled cuda error";
-    case ncclSystemError            : return "unhandled system error";
-    case ncclInternalError          : return "internal error";
-    case ncclInvalidArgument        : return "invalid argument";
-    case ncclInvalidUsage           : return "invalid usage";
+    case ncclUnhandledCudaError     : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)";
+    case ncclSystemError            : return "unhandled system error (run with NCCL_DEBUG=INFO for details)";
+    case ncclInternalError          : return "internal error - please report this issue to the NCCL developers";
+    case ncclInvalidArgument        : return "invalid argument (run with NCCL_DEBUG=WARN for details)";
+    case ncclInvalidUsage           : return "invalid usage (run with NCCL_DEBUG=WARN for details)";
     case ncclRemoteError            : return "remote process exited or there was a network error";
     case ncclInProgress             : return "NCCL operation in progress";
     default                         : return "unknown result code";
diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc
index 4fe90237ce..334ee10f69 100644
--- a/src/misc/cudawrap.cc
+++ b/src/misc/cudawrap.cc
@@ -6,10 +6,46 @@
 
 #include "nccl.h"
 #include "debug.h"
+#include "param.h"
 #include "cudawrap.h"
 
 #include <dlfcn.h>
 
+// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
+NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
+// Result of ncclIsCuMemSupported(), stored by initOnceFunc(); stays 0 until that has run.
+static int ncclCuMemSupported = 0;
+
+// Probe the current device for cuMem usability. Returns non-zero only if the driver version is >= 12000,
+// cuMemCreate was resolved, and the device reports both VMM and GPUDirect-RDMA-with-VMM support.
+int ncclIsCuMemSupported() {
+#if CUDART_VERSION < 11030
+  return 0;
+#else
+  ncclResult_t ret = ncclSuccess;
+  CUdevice cuDev;
+  int devIndex, driverVersion;
+  int attr = 0;
+  CUDACHECKGOTO(cudaDriverGetVersion(&driverVersion), ret, error);
+  if (driverVersion < 12000) return 0;  // Need CUDA_VISIBLE_DEVICES support
+  CUDACHECKGOTO(cudaGetDevice(&devIndex), ret, error);
+  if (!CUPFN(cuMemCreate)) return 0;
+  CUCHECKGOTO(cuDeviceGet(&cuDev, devIndex), ret, error);
+  // Is virtual memory management (cuMem VMM) available on this device?
+  CUCHECKGOTO(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, cuDev), ret, error);
+  if (attr == 0) return 0;
+  // Can VMM allocations be used for GPUDirect RDMA?
+  CUCHECKGOTO(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, cuDev), ret, error);
+  if (attr == 0) return 0;
+error:
+  return (ret == ncclSuccess);
+#endif
+}
+
+int ncclCuMemEnable() {  // NCCL_CUMEM_ENABLE == -2 auto-detects (on only when ncclCuMemSupported is set); any other non-zero value means on
+  return (ncclParamCuMemEnable() == -2) ? (ncclCuMemSupported != 0) : (ncclParamCuMemEnable() != 0);
+}
+
 #define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
 
 #if CUDART_VERSION >= 11030
@@ -35,6 +71,7 @@ DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
 DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
 DECLARE_CUDA_PFN(cuMemMap, 10020);
 DECLARE_CUDA_PFN(cuMemRelease, 10020);
+DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
 DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
 DECLARE_CUDA_PFN(cuMemUnmap, 10020);
 #if CUDA_VERSION >= 11070
@@ -89,7 +126,6 @@ static ncclResult_t cudaPfnFuncLoader(void) {
   LOAD_SYM(cuCtxSetCurrent, 4000, 1);
   LOAD_SYM(cuCtxGetDevice, 2000, 1);
 /* cuMem API support */
-#if CUDA_VERSION >= 11030
   LOAD_SYM(cuMemAddressReserve, 10020, 1);
   LOAD_SYM(cuMemAddressFree, 10020, 1);
   LOAD_SYM(cuMemCreate, 10020, 1);
@@ -98,9 +134,9 @@ static ncclResult_t cudaPfnFuncLoader(void) {
   LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
   LOAD_SYM(cuMemMap, 10020, 1);
   LOAD_SYM(cuMemRelease, 10020, 1);
+  LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
   LOAD_SYM(cuMemSetAccess, 10020, 1);
   LOAD_SYM(cuMemUnmap, 10020, 1);
-#endif
 #if CUDA_VERSION >= 11070
   LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
 #endif
@@ -135,7 +171,7 @@ static void initOnceFunc() {
   if (ncclCudaPath == NULL)
     snprintf(path, 1024, "%s", "libcuda.so");
   else
-    snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
+    snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so");
 
   (void) dlerror(); // Clear any previous errors
   cudaLib = dlopen(path, RTLD_LAZY);
@@ -195,6 +231,9 @@ static void initOnceFunc() {
   }
   #endif
 
+  // Determine whether we support the cuMem APIs or not
+  ncclCuMemSupported = ncclIsCuMemSupported();
+
   initResult = ncclSuccess;
   return;
 error:
diff --git a/src/misc/ibvsymbols.cc b/src/misc/ibvsymbols.cc
new file mode 100644
index 0000000000..c41a457c8f
--- /dev/null
+++ b/src/misc/ibvsymbols.cc
@@ -0,0 +1,158 @@
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "ibvsymbols.h"
+
+#ifdef NCCL_BUILD_RDMA_CORE
+/* RDMA-core linking mode. Symbols are pointers to linked IB Verbs */
+
+#define ASSIGN_SYM(container, symbol, name) container->name= &symbol;
+
+// Passthrough for the ibv_reg_mr macro from verbs.h: a real function that the
+// symbol table filled in by buildIbvSymbols can point at.
+struct ibv_mr* ibv_internal_reg_mr(
+    struct ibv_pd* pd, void* addr,
+    size_t length, int access) {
+  struct ibv_mr* mr = ibv_reg_mr(pd, addr, length, access);
+  return mr;
+}
+
+// Passthrough for the ibv_query_port macro from verbs.h: a real function that
+// the symbol table filled in by buildIbvSymbols can point at.
+int ibv_internal_query_port(
+    struct ibv_context* context, uint8_t port_num,
+    struct ibv_port_attr* port_attr) {
+  return ibv_query_port(context, port_num, port_attr);
+}
+
+// RDMA-core build: point the entries of *ibvSymbols at the linked verbs functions. Always returns ncclSuccess.
+ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
+  ibvSymbols->ibv_internal_get_device_list = &ibv_get_device_list;
+  ibvSymbols->ibv_internal_free_device_list = &ibv_free_device_list;
+  ibvSymbols->ibv_internal_get_device_name = &ibv_get_device_name;
+  ibvSymbols->ibv_internal_open_device = &ibv_open_device;
+  ibvSymbols->ibv_internal_close_device = &ibv_close_device;
+  ibvSymbols->ibv_internal_get_async_event = &ibv_get_async_event;
+  ibvSymbols->ibv_internal_ack_async_event = &ibv_ack_async_event;
+  ibvSymbols->ibv_internal_query_device = &ibv_query_device;
+  ibvSymbols->ibv_internal_query_gid = &ibv_query_gid;
+  ibvSymbols->ibv_internal_query_qp = &ibv_query_qp;
+  ibvSymbols->ibv_internal_alloc_pd = &ibv_alloc_pd;
+  ibvSymbols->ibv_internal_dealloc_pd = &ibv_dealloc_pd;
+  // Additional memory-registration entry points
+  ibvSymbols->ibv_internal_reg_mr_iova2 = &ibv_reg_mr_iova2;
+  ibvSymbols->ibv_internal_reg_dmabuf_mr = &ibv_reg_dmabuf_mr;
+  ibvSymbols->ibv_internal_dereg_mr = &ibv_dereg_mr;
+  ibvSymbols->ibv_internal_create_cq = &ibv_create_cq;
+  ibvSymbols->ibv_internal_destroy_cq = &ibv_destroy_cq;
+  ibvSymbols->ibv_internal_create_qp = &ibv_create_qp;
+  ibvSymbols->ibv_internal_modify_qp = &ibv_modify_qp;
+  ibvSymbols->ibv_internal_destroy_qp = &ibv_destroy_qp;
+  ibvSymbols->ibv_internal_fork_init = &ibv_fork_init;
+  ibvSymbols->ibv_internal_event_type_str = &ibv_event_type_str;
+  // These two go through the local passthrough functions defined in this file
+  ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr;
+  ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port;
+
+  return ncclSuccess;
+}
+
+#else
+/* RDMA-core dynamic loading mode. Symbols are loaded from shared objects. */
+
+#include <dlfcn.h>
+#include "core.h"
+
+// IBVERBS Library versioning
+#define IBVERBS_VERSION "IBVERBS_1.1"
+
+ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {  // dlopen build: resolve the verbs entry points at run time
+  static void* ibvhandle = NULL;
+  void* tmp;    // scratch for LOAD_SYM: result of the latest dlvsym lookup
+  void** cast;  // scratch for LOAD_SYM*: address of the table slot being filled, viewed as void**
+  // Try the unversioned library name first, then fall back to the .so.1 name.
+  ibvhandle=dlopen("libibverbs.so", RTLD_NOW);
+  if (!ibvhandle) {
+    ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW);
+    if (!ibvhandle) {
+      INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]");
+      goto teardown;
+    }
+  }
+
+#define LOAD_SYM(handle, symbol, funcptr) do {           \
+    cast = (void**)&funcptr;                             \
+    tmp = dlvsym(handle, symbol, IBVERBS_VERSION);       \
+    if (tmp == NULL) {                                   \
+      WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION);  \
+      goto teardown;                                     \
+    }                                                    \
+    *cast = tmp;                                         \
+  } while (0)
+
+// Attempt to load a specific symbol version - fail silently
+#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do {  \
+    cast = (void**)&funcptr;                                     \
+    *cast = dlvsym(handle, symbol, version);                     \
+  } while (0)
+  // LOAD_SYM entries are mandatory: a failed lookup warns and jumps to teardown.
+  LOAD_SYM(ibvhandle, "ibv_get_device_list", ibvSymbols->ibv_internal_get_device_list);
+  LOAD_SYM(ibvhandle, "ibv_free_device_list", ibvSymbols->ibv_internal_free_device_list);
+  LOAD_SYM(ibvhandle, "ibv_get_device_name", ibvSymbols->ibv_internal_get_device_name);
+  LOAD_SYM(ibvhandle, "ibv_open_device", ibvSymbols->ibv_internal_open_device);
+  LOAD_SYM(ibvhandle, "ibv_close_device", ibvSymbols->ibv_internal_close_device);
+  LOAD_SYM(ibvhandle, "ibv_get_async_event", ibvSymbols->ibv_internal_get_async_event);
+  LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibvSymbols->ibv_internal_ack_async_event);
+  LOAD_SYM(ibvhandle, "ibv_query_device", ibvSymbols->ibv_internal_query_device);
+  LOAD_SYM(ibvhandle, "ibv_query_port", ibvSymbols->ibv_internal_query_port);
+  LOAD_SYM(ibvhandle, "ibv_query_gid", ibvSymbols->ibv_internal_query_gid);
+  LOAD_SYM(ibvhandle, "ibv_query_qp", ibvSymbols->ibv_internal_query_qp);
+  LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibvSymbols->ibv_internal_alloc_pd);
+  LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibvSymbols->ibv_internal_dealloc_pd);
+  LOAD_SYM(ibvhandle, "ibv_reg_mr", ibvSymbols->ibv_internal_reg_mr);
+  // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
+  LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibvSymbols->ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
+  // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
+  LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibvSymbols->ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
+  LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibvSymbols->ibv_internal_dereg_mr);
+  LOAD_SYM(ibvhandle, "ibv_create_cq", ibvSymbols->ibv_internal_create_cq);
+  LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibvSymbols->ibv_internal_destroy_cq);
+  LOAD_SYM(ibvhandle, "ibv_create_qp", ibvSymbols->ibv_internal_create_qp);
+  LOAD_SYM(ibvhandle, "ibv_modify_qp", ibvSymbols->ibv_internal_modify_qp);
+  LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibvSymbols->ibv_internal_destroy_qp);
+  LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init);
+  LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str);
+  // Every LOAD_SYM lookup succeeded; the two LOAD_SYM_VERSION slots hold whatever dlvsym returned.
+  return ncclSuccess;
+
+teardown:  // failure path: reset every table entry to NULL, then close the library if it was opened
+  ibvSymbols->ibv_internal_get_device_list = NULL;
+  ibvSymbols->ibv_internal_free_device_list = NULL;
+  ibvSymbols->ibv_internal_get_device_name = NULL;
+  ibvSymbols->ibv_internal_open_device = NULL;
+  ibvSymbols->ibv_internal_close_device = NULL;
+  ibvSymbols->ibv_internal_get_async_event = NULL;
+  ibvSymbols->ibv_internal_ack_async_event = NULL;
+  ibvSymbols->ibv_internal_query_device = NULL;
+  ibvSymbols->ibv_internal_query_port = NULL;
+  ibvSymbols->ibv_internal_query_gid = NULL;
+  ibvSymbols->ibv_internal_query_qp = NULL;
+  ibvSymbols->ibv_internal_alloc_pd = NULL;
+  ibvSymbols->ibv_internal_dealloc_pd = NULL;
+  ibvSymbols->ibv_internal_reg_mr = NULL;
+  ibvSymbols->ibv_internal_reg_mr_iova2 = NULL;
+  ibvSymbols->ibv_internal_reg_dmabuf_mr = NULL;
+  ibvSymbols->ibv_internal_dereg_mr = NULL;
+  ibvSymbols->ibv_internal_create_cq = NULL;
+  ibvSymbols->ibv_internal_destroy_cq = NULL;
+  ibvSymbols->ibv_internal_create_qp = NULL;
+  ibvSymbols->ibv_internal_modify_qp = NULL;
+  ibvSymbols->ibv_internal_destroy_qp = NULL;
+  ibvSymbols->ibv_internal_fork_init = NULL;
+  ibvSymbols->ibv_internal_event_type_str = NULL;
+
+  if (ibvhandle != NULL) dlclose(ibvhandle);  // ibvhandle is NULL here only when both dlopen attempts failed
+  return ncclSystemError;
+}
+
+#endif
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc
index 8a736d3cf1..bc896e10eb 100644
--- a/src/misc/ibvwrap.cc
+++ b/src/misc/ibvwrap.cc
@@ -8,314 +8,186 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include <dlfcn.h>
-#include "core.h"
-
-/*Function Pointers*/
-int (*ibv_internal_fork_init)(void);
-struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
-void (*ibv_internal_free_device_list)(struct ibv_device **list);
-const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
-struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
-int (*ibv_internal_close_device)(struct ibv_context *context);
-int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
-void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
-int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
-int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
-int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
-int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
-struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
-int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
-struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
-struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
-/* DMA-BUF support */
-struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
-int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
-struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
-int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
-struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
-int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
-int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
-const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
-
-// IBVERBS Library versioning
-#define IBVERBS_VERSION "IBVERBS_1.1"
+#include "ibvsymbols.h"
 
 static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
 static ncclResult_t initResult;
-
-static void initOnceFunc(void) {
-  static void* ibvhandle = NULL;
-  void* tmp;
-  void** cast;
-
-  ibvhandle=dlopen("libibverbs.so", RTLD_NOW);
-  if (!ibvhandle) {
-    ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW);
-    if (!ibvhandle) {
-      INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]");
-      goto teardown;
-    }
-  }
-
-#define LOAD_SYM(handle, symbol, funcptr) do {           \
-    cast = (void**)&funcptr;                             \
-    tmp = dlvsym(handle, symbol, IBVERBS_VERSION);       \
-    if (tmp == NULL) {                                   \
-      WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION);  \
-      goto teardown;                                     \
-    }                                                    \
-    *cast = tmp;                                         \
-  } while (0)
-
-// Attempt to load a specific symbol version - fail silently
-#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do {  \
-    cast = (void**)&funcptr;                                     \
-    *cast = dlvsym(handle, symbol, version);                     \
-  } while (0)
-
-  LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
-  LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
-  LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
-  LOAD_SYM(ibvhandle, "ibv_open_device", ibv_internal_open_device);
-  LOAD_SYM(ibvhandle, "ibv_close_device", ibv_internal_close_device);
-  LOAD_SYM(ibvhandle, "ibv_get_async_event", ibv_internal_get_async_event);
-  LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibv_internal_ack_async_event);
-  LOAD_SYM(ibvhandle, "ibv_query_device", ibv_internal_query_device);
-  LOAD_SYM(ibvhandle, "ibv_query_port", ibv_internal_query_port);
-  LOAD_SYM(ibvhandle, "ibv_query_gid", ibv_internal_query_gid);
-  LOAD_SYM(ibvhandle, "ibv_query_qp", ibv_internal_query_qp);
-  LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
-  LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
-  LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
-  // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
-  LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
-  // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
-  LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
-  LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
-  LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
-  LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
-  LOAD_SYM(ibvhandle, "ibv_create_qp", ibv_internal_create_qp);
-  LOAD_SYM(ibvhandle, "ibv_modify_qp", ibv_internal_modify_qp);
-  LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibv_internal_destroy_qp);
-  LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init);
-  LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str);
-
-  initResult = ncclSuccess;
-  return;
-
-teardown:
-  ibv_internal_get_device_list = NULL;
-  ibv_internal_free_device_list = NULL;
-  ibv_internal_get_device_name = NULL;
-  ibv_internal_open_device = NULL;
-  ibv_internal_close_device = NULL;
-  ibv_internal_get_async_event = NULL;
-  ibv_internal_ack_async_event = NULL;
-  ibv_internal_query_device = NULL;
-  ibv_internal_query_port = NULL;
-  ibv_internal_query_gid = NULL;
-  ibv_internal_query_qp = NULL;
-  ibv_internal_alloc_pd = NULL;
-  ibv_internal_dealloc_pd = NULL;
-  ibv_internal_reg_mr = NULL;
-  ibv_internal_reg_mr_iova2 = NULL;
-  ibv_internal_reg_dmabuf_mr = NULL;
-  ibv_internal_dereg_mr = NULL;
-  ibv_internal_create_cq = NULL;
-  ibv_internal_destroy_cq = NULL;
-  ibv_internal_create_qp = NULL;
-  ibv_internal_modify_qp = NULL;
-  ibv_internal_destroy_qp = NULL;
-  ibv_internal_fork_init = NULL;
-  ibv_internal_event_type_str = NULL;
-
-  if (ibvhandle != NULL) dlclose(ibvhandle);
-  initResult = ncclSystemError;
-  return;
-}
+struct ncclIbvSymbols ibvSymbols;
 
 ncclResult_t wrap_ibv_symbols(void) {
-  pthread_once(&initOnceControl, initOnceFunc);
+  pthread_once(&initOnceControl,
+               [](){ initResult = buildIbvSymbols(&ibvSymbols); });
   return initResult;
 }
 
-#define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \
-  if (name_internal == NULL) { \
+/* CHECK_NOT_NULL: helper macro that warns and returns ncclInternalError from the calling function when the symbol pointer is NULL */
+#define CHECK_NOT_NULL(container, internal_name) \
+  if (container.internal_name == NULL) { \
      WARN("lib wrapper not initialized."); \
      return ncclInternalError; \
-  } \
-  retval = call; \
+  }
+
+#define IBV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \
+  CHECK_NOT_NULL(container, internal_name); \
+  retval = container.call; \
   if (retval == error_retval) { \
     WARN("Call to " name " failed with error %s", strerror(errno)); \
     return ncclSystemError; \
   } \
   return ncclSuccess;
 
-#define IBV_PTR_CHECK(name_internal, call, retval, error_retval, name) \
-  if (name_internal == NULL) { \
-     WARN("lib wrapper not initialized."); \
-     return ncclInternalError; \
-  } \
-  retval = call; \
+#define IBV_PTR_CHECK(container, internal_name, call, retval, error_retval, name) \
+  CHECK_NOT_NULL(container, internal_name); \
+  retval = container.call; \
   if (retval == error_retval) { \
     WARN("Call to " name " failed"); \
     return ncclSystemError; \
   } \
   return ncclSuccess;
 
-#define IBV_INT_CHECK_RET_ERRNO(name_internal, call, success_retval, name) \
-  if (name_internal == NULL) { \
-     WARN("lib wrapper not initialized."); \
-     return ncclInternalError; \
-  } \
-  int ret = call; \
+#define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
+  CHECK_NOT_NULL(container, internal_name); \
+  int ret = container.call; \
   if (ret != success_retval) { \
     WARN("Call to " name " failed with error %s", strerror(ret)); \
     return ncclSystemError; \
   } \
   return ncclSuccess;
 
-#define IBV_INT_CHECK(name_internal, call, error_retval, name) \
-  if (name_internal == NULL) { \
-     WARN("lib wrapper not initialized."); \
-     return ncclInternalError; \
-  } \
-  int ret = call; \
+#define IBV_INT_CHECK(container, internal_name, call, error_retval, name) \
+  CHECK_NOT_NULL(container, internal_name); \
+  int ret = container.call; \
   if (ret == error_retval) { \
     WARN("Call to " name " failed"); \
     return ncclSystemError; \
   } \
   return ncclSuccess;
 
-#define IBV_PASSTHRU(name_internal, call) \
-  if (name_internal == NULL) { \
-     WARN("lib wrapper not initialized."); \
-     return ncclInternalError; \
-  } \
-  call; \
+#define IBV_PASSTHRU(container, internal_name, call) \
+  CHECK_NOT_NULL(container, internal_name); \
+  container.call; \
   return ncclSuccess;
 
 ncclResult_t wrap_ibv_fork_init() {
-  IBV_INT_CHECK(ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
+  IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
 }
 
 ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {
-  *ret = ibv_internal_get_device_list(num_devices);
+  *ret = ibvSymbols.ibv_internal_get_device_list(num_devices);
   if (*ret == NULL) *num_devices = 0;
   return ncclSuccess;
 }
 
 ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {
-  IBV_PASSTHRU(ibv_internal_free_device_list, ibv_internal_free_device_list(list));
+  IBV_PASSTHRU(ibvSymbols, ibv_internal_free_device_list, ibv_internal_free_device_list(list));
 }
 
 const char *wrap_ibv_get_device_name(struct ibv_device *device) {
-  if (ibv_internal_get_device_name == NULL) {
+  if (ibvSymbols.ibv_internal_get_device_name == NULL) {
     WARN("lib wrapper not initialized.");
     exit(-1);
   }
-  return ibv_internal_get_device_name(device);
+  return ibvSymbols.ibv_internal_get_device_name(device);
 }
 
 ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/
-  IBV_PTR_CHECK(ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
+  IBV_PTR_CHECK(ibvSymbols, ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
 }
 
 ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/
-  IBV_INT_CHECK(ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
+  IBV_INT_CHECK(ibvSymbols, ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
 }
 
 ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/
-  IBV_INT_CHECK(ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
+  IBV_INT_CHECK(ibvSymbols, ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
 }
 
 ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {
-  IBV_PASSTHRU(ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
+  IBV_PASSTHRU(ibvSymbols, ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
 }
 
 ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
 }
 
 ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
 }
 
 ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
 }
 
 ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
 }
 
 ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
-  IBV_PTR_CHECK_ERRNO(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
+  IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
 }
 
 ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
 }
 
 ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
-  IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
+  IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
 }
 
 struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
-  if (ibv_internal_reg_mr == NULL) {
+  if (ibvSymbols.ibv_internal_reg_mr == NULL) {
     WARN("lib wrapper not initialized.");
     return NULL;
   }
-  return ibv_internal_reg_mr(pd, addr, length, access);
+  return ibvSymbols.ibv_internal_reg_mr(pd, addr, length, access);
 }
 
 ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
-  if (ibv_internal_reg_mr_iova2 == NULL) {
+  if (ibvSymbols.ibv_internal_reg_mr_iova2 == NULL) {
     return ncclInternalError;
   }
   if (ret == NULL) { return ncclSuccess; } // Assume dummy call
-  IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
+  IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
 }
 
 /* DMA-BUF support */
 ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
-  IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
+  IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
 }
 
 struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
-  if (ibv_internal_reg_dmabuf_mr == NULL) {
+  if (ibvSymbols.ibv_internal_reg_dmabuf_mr == NULL) {
     errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
     return NULL;
   }
-  return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
+  return ibvSymbols.ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
 }
 
 ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
 }
 
 ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
-  IBV_PTR_CHECK_ERRNO(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
+  IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
 }
 
 ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
 }
 
 ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
 }
 
 ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
-  IBV_PTR_CHECK_ERRNO(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
+  IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
 }
 
 ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
-  IBV_INT_CHECK_RET_ERRNO(ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
 }
 
 ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
-  *ret = (char *) ibv_internal_event_type_str(event);
+  *ret = (char *) ibvSymbols.ibv_internal_event_type_str(event);
   return ncclSuccess;
 }
diff --git a/src/misc/msccl/msccl_setup.cc b/src/misc/msccl/msccl_setup.cc
index b815d96fde..c8ddbe477c 100644
--- a/src/misc/msccl/msccl_setup.cc
+++ b/src/misc/msccl/msccl_setup.cc
@@ -106,7 +106,7 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
   proxyOp.pattern = 0;
   proxyOp.root = 0;
   proxyOp.nbytes = status.stepSize*proxyOp.sliceSteps;
-  proxyOp.opCount = comm->collOpCount;
+  proxyOp.opCount = comm->sharedRes->collOpCount;
   int nLoops = (int)(DIVUP(status.nBytes, (size_t)((size_t)hostAlgo->nChunksPerLoop*(size_t)status.chunkEffectiveSize)));
   int nLoopsChunkSteps = nLoops * status.chunkSteps;
   for (int ch = 0; ch < hostAlgo->nChannels; ch++) {
@@ -123,7 +123,7 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
       }
       proxyOp.nsteps = nLoopsChunkSteps * nRecvs;
       if (proxyOp.nsteps > 0) {
-        NCCLCHECK(mscclSaveProxy(ncclChannel, proxyRecv, recvPeer->peer, &proxyOp, 0));
+        NCCLCHECK(mscclSaveProxy(comm, ncclChannel, proxyRecv, recvPeer->peer, &proxyOp, 0));
       }
     }
     for (int i=0; i<mscclChannel->nSendPeers; i++){
@@ -136,12 +136,12 @@ ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
       }
       proxyOp.nsteps = nLoopsChunkSteps * nSends;
       if (proxyOp.nsteps > 0) {
-        NCCLCHECK(mscclSaveProxy(ncclChannel, proxySend, sendPeer->peer, &proxyOp, 0));
+        NCCLCHECK(mscclSaveProxy(comm, ncclChannel, proxySend, sendPeer->peer, &proxyOp, 0));
       }
     }
   }
   NCCLCHECK(ncclProxyStart(comm));
-  comm->collOpCount++;
+  comm->sharedRes->collOpCount++;
   return ncclSuccess;
 }
 
diff --git a/src/misc/rocmwrap.cc b/src/misc/rocmwrap.cc
index e32038955d..71ba5b3ab8 100644
--- a/src/misc/rocmwrap.cc
+++ b/src/misc/rocmwrap.cc
@@ -170,4 +170,6 @@ error:
   return ncclSystemError;
 }
 
-
+int ncclCuMemEnable() {
+  return 0;
+}
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
index 69f7b1bde0..ce05c3ef3e 100644
--- a/src/misc/shmutils.cc
+++ b/src/misc/shmutils.cc
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <utils.h>
 
 struct shmHandleInternal {
   int fd;
@@ -31,7 +32,7 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS
   handle->devShmPtr = dptr;
   handle->shmSize = shmSize;
   handle->realShmSize = realShmSize;
-  handle->refcount = (int*)(hptr + shmSize);
+  handle->refcount = (hptr != NULL) ? (int*)(hptr + shmSize) : NULL;
   if (create) {
     int slen = strlen(shmPath);
     handle->shmPath = (char*)malloc(slen + 1);
@@ -80,23 +81,20 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
   if (hptr == MAP_FAILED) {
     WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
     ret = ncclSystemError;
+    hptr = NULL;
     goto fail;
   }
 
   if (create) {
     *(int*)(hptr + shmSize) = refcount;
   } else {
-    int remref = __atomic_sub_fetch((int*)(hptr + shmSize), 1, __ATOMIC_RELAXED);
+    int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize));
     if (remref == 0) {
       /* the last peer has completed attachment, it should unlink the shm mem file. */
       if (unlink(shmPath) != 0) {
         WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
       }
     }
-
-    if (refcount != -1) {
-      WARN("attaching memory should only reduce refcount by 1 but %d is passed", refcount);
-    }
   }
 
   if (devShmPtr) {
@@ -128,13 +126,13 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
   if (tmphandle) {
     if (tmphandle->fd >= 0) {
       close(tmphandle->fd);
-      if (tmphandle->shmPath != NULL && *tmphandle->refcount > 0) {
+      if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
         if (unlink(tmphandle->shmPath) != 0) {
           WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
           ret = ncclSystemError;
         }
-        free(tmphandle->shmPath);
       }
+      free(tmphandle->shmPath);
     }
 
     if (tmphandle->shmPtr) {
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
index 6d934c4bd6..612498c884 100644
--- a/src/misc/socket.cc
+++ b/src/misc/socket.cc
@@ -419,7 +419,7 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
   if (sock->fd != -1) {
     sock->state = ncclSocketStateAccepted;
   } else if (errno != EAGAIN && errno != EWOULDBLOCK) {
-    WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno);
+    WARN("socketTryAccept: Accept failed: %s", strerror(errno));
     return ncclSystemError;
   }
   return ncclSuccess;
@@ -429,6 +429,9 @@ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
   uint64_t magic;
   enum ncclSocketType type;
   int received = 0;
+  const int one = 1;
+  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+
   NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
   if (received == 0) return ncclSuccess;
   NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 6047b2f21d..e4ebb92a21 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -23,7 +23,6 @@
 #define RCCL_BFLOAT16 1
 #define RCCL_GATHER_SCATTER 1
 #define RCCL_ALLTOALLV 1
-#define RCCL_MULTIRANKPERGPU 1
 
 #ifdef __cplusplus
 extern "C" {
@@ -50,6 +49,7 @@ typedef enum { ncclSuccess                 =  0,
 
 #define NCCL_CONFIG_UNDEF_INT INT_MIN
 #define NCCL_CONFIG_UNDEF_PTR NULL
+#define NCCL_SPLIT_NOCOLOR -1
 
 /* Communicator configuration. Users can assign value to attributes to specify the
  * behavior of a communicator. */
@@ -64,6 +64,7 @@ typedef struct ncclConfig_v21700 {
   int minCTAs;
   int maxCTAs;
   const char *netName;
+  int splitShare;
 } ncclConfig_t;
 
 /* Config initializer must be assigned to initialize config structure when it is created.
@@ -76,7 +77,8 @@ typedef struct ncclConfig_v21700 {
   NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
   NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
   NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
-  NCCL_CONFIG_UNDEF_PTR                     /* netName */               \
+  NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
+  NCCL_CONFIG_UNDEF_INT                     /* splitShare */            \
 }
 
 /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
@@ -131,28 +133,6 @@ ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
 ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
 /// @endcond
 
-/*! @brief Creates a new communicator (multi thread/process version) allowing multiple ranks per device.
-
-    @details
-    rank must be between 0 and nranks-1 and unique within a communicator clique.
-    Each rank is associated to a HIP device, which has to be set before calling
-    ncclCommInitRankMulti.
-    Since this version of the function allows multiple ranks to utilize the same
-    HIP device, a unique virtualId per device has to be provided by each calling
-    rank.
-    ncclCommInitRankMulti implicitly syncronizes with other ranks, so it must be
-    called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
-
-    @param[in]
-    comm        ncclComm_t*
-                communicator struct pointer
-    */
-  ncclResult_t  ncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
-/// @cond include_hidden
-  ncclResult_t pncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
-/// @endcond
-
-
 /*! @brief Creates a clique of communicators (single process version).
  *
  * @details This is a convenience function to create a single-process communicator clique.
@@ -191,6 +171,19 @@ ncclResult_t  ncclCommAbort(ncclComm_t comm);
 ncclResult_t pncclCommAbort(ncclComm_t comm);
 /// @endcond
 
+/*! @brief Creates one or more communicators from an existing one.
+ * Ranks with the same color will end up in the same communicator.
+ * Within the new communicator, key will be used to order ranks.
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
+ * and will therefore return a NULL communicator.
+ * If config is NULL, the new communicator will inherit the original communicator's
+ * configuration. */
+ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+/// @cond include_hidden
+ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+/// @endcond
+
+/* Returns a string for each error code. */
 /*! @brief Returns a string for each error code. */
 const char*  ncclGetErrorString(ncclResult_t result);
 /// @cond include_hidden
diff --git a/src/net.cc b/src/net.cc
index d31a000202..2524d9c753 100644
--- a/src/net.cc
+++ b/src/net.cc
@@ -265,10 +265,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
 
 ncclResult_t ncclNetInit(struct ncclComm* comm) {
   // Initialize main communication network
-  char* netName;
+  const char* netName;
   bool ok = false;
 
-  netName = comm->netName;
+  netName = comm->config.netName;
   for (int i=0; i<3; i++) {
     if (ncclNets[i] == nullptr) continue;
     enum ncclNetState state;
@@ -309,27 +309,31 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
     return ncclSuccess;
   }
 #endif
-  int netDevs;
-  NCCLCHECK(ncclNetDevices(comm, &netDevs));
-  *gdrSupport = 0;
-  for (int dev=0; dev<netDevs; dev++) {
-    // Find a net device which is GDR-capable
-    ncclNetProperties_t props;
-    NCCLCHECK(ncclNetGetProperties(comm, dev, &props));
-    if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+  static int gdrSupportMatrix[32] = {
+	  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  if (gdrSupportMatrix[comm->cudaDev] == -1) {
+    int netDevs;
+    NCCLCHECK(comm->ncclNet->devices(&netDevs));
+    gdrSupportMatrix[comm->cudaDev] = 0;
+    for (int dev=0; dev<netDevs; dev++) {
+      // Find a net device which is GDR-capable
+      ncclNetProperties_t props;
+      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
+      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    *gdrSupport = 1;
-    break;
+      gdrSupportMatrix[comm->cudaDev] = 1;
+      break;
 #endif
 
     // Allocate memory on the GPU and try to register it on the NIC.
     void *lComm = NULL, *sComm = NULL, *rComm = NULL;
     ncclNetHandle_t handle;
-    void* gpuPtr = NULL;
+    char* gpuPtr = NULL;
     void* mHandle = NULL;
     ncclResult_t ret;
     ncclDebugNoWarn = NCCL_NET;
-    NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
+    NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
 
     bool connected;
     connected = false;
@@ -341,32 +345,34 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
       }
 
       if (sComm == NULL)
-        NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2);
+        NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2);
 
       if (rComm == NULL)
-        NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2);
+        NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2);
 
       connected = (rComm != NULL) && (sComm != NULL);
     }
 
-    CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
-    if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
-      NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
-      NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
-      NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle));
-      *gdrSupport = 1;
+    NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
+    if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+      NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
+      NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+      NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
+      gdrSupportMatrix[comm->cudaDev] = 1;
     }
     ncclDebugNoWarn = 0;
-    CUDACHECK(cudaFree(gpuPtr));
+    NCCLCHECK(ncclCudaFree(gpuPtr));
 cleanup2:
     if (rComm != NULL)
-      NCCLCHECK(ncclNetCloseRecv(comm, rComm));
+      NCCLCHECK(comm->ncclNet->closeRecv(rComm));
     if (sComm != NULL)
-      NCCLCHECK(ncclNetCloseSend(comm, sComm));
-    NCCLCHECK(ncclNetCloseListen(comm, lComm));
+      NCCLCHECK(comm->ncclNet->closeSend(sComm));
+    NCCLCHECK(comm->ncclNet->closeListen(lComm));
 cleanup1:
-    break;
+      break;
+    }
   }
+  *gdrSupport = gdrSupportMatrix[comm->cudaDev];
   return ncclSuccess;
 }
 
diff --git a/src/proxy.cc b/src/proxy.cc
index 74551365cd..c4b63ed346 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -65,6 +65,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
       }
 
       memcpy(elem->respBuff, respBuff, respSize);
+      free(respBuff);
       elem->done = true;
       return ncclSuccess;
     }
@@ -75,7 +76,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
   return ncclInternalError;
 }
 
-static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize, void* respData, int respDataSize) {
+static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize) {
   struct ncclExpectedProxyResponse* ex;
   NCCLCHECK(ncclCalloc(&ex, 1));
   ex->opId = opId;
@@ -84,10 +85,6 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v
   ex->respBuff = malloc(respSize);
   ex->respSize = respSize;
   ex->done     = false;
-  if (respData) {
-    memcpy(ex->respBuff, respData, respDataSize);
-    ex->done = true;
-  }
 
   // Enqueue
   struct ncclExpectedProxyResponse* list = state->expectedResponses;
@@ -440,10 +437,11 @@ ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextO
   return ncclSuccess;
 }
 
-ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) {
-  struct ncclProxyOps* proxyOps = proxyConn->comm->proxyState.proxyOps;
+static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) {
+  int tpLocalRank = comm->topParentLocalRanks[comm->localRank];
+  struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps;
   if (proxyOps == NULL) return ncclInternalError;
-  proxyOps += proxyConn->localRank;
+  proxyOps += proxyConn->tpLocalRank;
   struct ncclProxyOpsPool* pool = proxyOps->pool;
 
   TIME_START(0);
@@ -454,9 +452,9 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
     proxyOps->freeOp = op->next;
   } else {
     int freeOp;
-    while ((freeOp = pool->freeOps[comm->localRank]) == -1) sched_yield();
+    while ((freeOp = pool->freeOps[tpLocalRank]) == -1) sched_yield();
     int freeOpNew;
-    while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+comm->localRank, freeOp, -1)) != freeOp) freeOp = freeOpNew;
+    while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+tpLocalRank, freeOp, -1)) != freeOp) freeOp = freeOpNew;
     opIndex = freeOp;
     op = pool->ops+opIndex;
     proxyOps->freeOp = op->next;
@@ -501,13 +499,13 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
   return ncclSuccess;
 }
 
-static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
+static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
   if (peer < 0) return ncclSuccess;
 
-  struct ncclChannelPeer* peerComm = channel->peers+peer;
+  struct ncclChannelPeer* peerComm = channel->peers[peer];
   struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
   if (connector->transportComm == NULL) {
-    WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
+    WARN("Rank %d has no transport for %s peer %d on channel %d/%d", comm->rank,
         type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
     return ncclInternalError;
   }
@@ -515,13 +513,13 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s
 
   if (justInquire) *justInquire = true;
   else {
-    NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
+    NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op));
   }
   return ncclSuccess;
 }
 
-ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) {
-  NCCLCHECK(SaveProxy(channel, type, peer, op, connIndex, nullptr));
+ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { // Non-static wrapper that forwards to the file-local SaveProxy
+  NCCLCHECK(SaveProxy(comm, channel, type, peer, op, connIndex, nullptr)); // justInquire is nullptr: SaveProxy appends the op rather than only setting the inquire flag
   return ncclSuccess;
 }
 
@@ -537,10 +535,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
   case ncclPatternPipelineTo: {
       struct ncclRing* ring = &channel->ring;
       if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
-        NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
+        NCCLCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
       }
       if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
-        NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex, justInquire));
+        NCCLCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, op->connIndex, justInquire));
       }
     } break;
   case ncclPatternTreeUp:
@@ -549,30 +547,42 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
       if (op->pattern != ncclPatternTreeDown) { // Tree up
         struct ncclTree* tree = &channel->tree;
         for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) {
-          NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0, justInquire));
+          NCCLCHECK(SaveProxy(comm, channel, proxyRecv, tree->down[i], op, 0, justInquire));
         }
-        NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire));
+        NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->up, op, 0, justInquire));
       }
       if (op->pattern != ncclPatternTreeUp) { // Tree down
         struct ncclTree* tree = &channel->tree;
         for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) {
-          NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire));
+          NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->down[i], op, 0, justInquire));
         }
-        NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire));
+        NCCLCHECK(SaveProxy(comm, channel, proxyRecv, tree->up, op, 0, justInquire));
       }
     } break;
   case ncclPatternCollnetChain: {
-      NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetChain.up, op, 1, justInquire));
-      NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetChain.up, op, 1, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire));
     } break;
   case ncclPatternCollnetDirect: {
-      NCCLCHECK(SaveProxy(channel, proxySend, channel->collnetDirect.out, op, 1, justInquire));
-      NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetDirect.out, op, 1, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire));
+    } break;
+  case ncclPatternNvls: {
+      NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.out, op, 1, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.out, op, 0, justInquire));
+    } break;
+  case ncclPatternNvlsTree: {
+      NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[1], op, 0, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[2], op, 0, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeUp, op, 0, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[1], op, 0, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
     } break;
   case ncclPatternSend:
   case ncclPatternRecv: {
       if (op->root == comm->rank) return ncclSuccess;
-      NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
+      NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
     } break;
   }
   return ncclSuccess;
@@ -596,7 +606,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
   info->chunkSize = stepSize;
   op->root = info->root;
 
-  struct ncclChannelPeer* peer = channel->peers + op->root;
+  struct ncclChannelPeer* peer = channel->peers[op->root];
   if (info->coll == ncclFuncSend) {
     op->pattern = ncclPatternSend;
     if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
@@ -665,13 +675,13 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr
   return ncclSuccess;
 }
 
-static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) {
+static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) {
   struct ncclProxyArgs* prevOp = NULL;
   struct ncclProxyArgs* op = opStart;
   while (op) {
     if (op->state == ncclProxyOpNone) return ncclInternalError;
     TIME_START(0); TIME_START(1);
-    NCCLCHECK(op->progress(comm, op));
+    NCCLCHECK(op->progress(proxyState, op));
     if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
     *idle &= op->idle;
     if (op->state == ncclProxyOpNone) {
@@ -688,8 +698,8 @@ static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressS
 
 NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
 
-static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) {
-  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int* added) {
+  struct ncclProxyProgressState* state = &proxyState->progressState;
   if (state->opsPool == NULL) return ncclInternalError;
   struct ncclProxyOpsPool* pool = state->opsPool;
 
@@ -724,7 +734,7 @@ process_nextops:
   TIME_START(2);
   int freeOp[NCCL_MAX_LOCAL_RANKS];
   int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
-  for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;
+  for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1;
 
   uint64_t lastOpCount = 0;
   int lastPeer = -1;
@@ -752,7 +762,7 @@ process_nextops:
     state->nextOps = opIndex;
   }
 
-  for (int i=0; i<comm->localRanks; i++) {
+  for (int i = 0; i < proxyState->tpLocalnRanks; i++) {
     if (freeOp[i] == -1) continue;
     int newFree = freeOp[i];
     int oldFree = pool->freeOps[i];
@@ -784,7 +794,7 @@ void ncclDumpProxyState(int signal) {
 }
 
 NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);
-ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
+static int setProxyThreadContext(struct ncclProxyState* proxyState) {
 #if CUDART_VERSION >= 11030
   static int createThreadContext = -1;
 
@@ -798,44 +808,44 @@ ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
     }
   }
   if (createThreadContext) {
-    if (comm->proxyState.cudaCtx == NULL) {
-      if (CUPFN(cuCtxCreate(&comm->proxyState.cudaCtx,
-                                  CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) {
-        WARN("Failed to create CUDA context on device %d", comm->cudaDev);
+    if (proxyState->cudaCtx == NULL) {
+      if (CUPFN(cuCtxCreate(&proxyState->cudaCtx,
+                                  CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
+        WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
         createThreadContext = 0;
-        return ncclSuccess;
       }
     } else {
-      if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) {
-        WARN("Failed to set CUDA context on device %d", comm->cudaDev);
-        return ncclUnhandledCudaError;
+      if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
+        WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
+        return 0;
       }
+      return 1;
     }
   }
 #endif
-  return ncclSuccess;
+  return 0;
 }
 
 // Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs
 NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1);
 NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
 
-void* ncclProxyProgress(void *comm_) {
-  struct ncclComm* comm = (struct ncclComm*)comm_;
-  if (ncclSetThreadContext(comm) != ncclSuccess) {
-    WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev);
-  } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) {
-    WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev);
+void* ncclProxyProgress(void *proxyState_) {
+  struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_;
+  if (setProxyThreadContext(proxyState)) {
+    INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
+  } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
+    WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
   }
-  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
 
-  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  struct ncclProxyProgressState* state = &proxyState->progressState;
   state->nextOps = -1;
   const int sig = ncclParamProxyDumpSignal();
   if (sig != -1) signal(sig, ncclDumpProxyState);
   ncclLastProxyState = state;
   char threadName[NCCL_THREAD_NAMELEN];
-  snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev);
+  snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", proxyState->cudaDev);
   nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
 
   int lastIdle = 0;
@@ -846,11 +856,10 @@ void* ncclProxyProgress(void *comm_) {
    * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
   int proxyOpAppendCounter = 0;
   struct ncclProxyArgs profArgs; // Only used for profiling purposes
-  while ((state->stop == false || (state->stop == true && state->active)) && *comm->abortFlag == 0) {
+  while ((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) {
     int idle = 1;
-    ncclResult_t ret = progressOps(comm, state, state->active, &idle);
+    ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
     if (ret != ncclSuccess) {
-      (void) ncclCommSetAsyncError(comm, ret);
       INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
       return NULL;
     }
@@ -861,10 +870,9 @@ void* ncclProxyProgress(void *comm_) {
       proxyOpAppendCounter = 0;
       TIME_START(3);
       if (state->stop == false)
-        ret = ncclProxyGetPostedOps(comm, &added);
+        ret = ncclProxyGetPostedOps(proxyState, &added);
       if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
       if (ret != ncclSuccess) {
-        (void) ncclCommSetAsyncError(comm, ret);
         INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
       }
       if (added == 0) {
@@ -877,11 +885,11 @@ void* ncclProxyProgress(void *comm_) {
 }
 
 ncclResult_t ncclProxyStart(struct ncclComm* comm) {
-  struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps;
+  struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps;
   if (proxyOps == NULL) return ncclSuccess;
   TIME_START(1);
-  for (int r=0; r<comm->localRanks; r++) {
-    struct ncclProxyOps* ops = proxyOps+r;
+  for (int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) {
+    struct ncclProxyOps* ops = proxyOps + r;
     if (ops->pool == NULL || ops->nextOps == -1) continue;
     NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd));
     ops->nextOps = ops->nextOpsEnd = -1;
@@ -892,17 +900,17 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxyProgressCreate(struct ncclComm* comm) {
-  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) { // Start the proxy progress thread unless state->thread is already set
+  struct ncclProxyProgressState* state = &proxyState->progressState;
   if (!state->thread) {
-    pthread_create(&state->thread, NULL, ncclProxyProgress, comm);
-    ncclSetThreadName(state->thread, "NCCL Progress%2d", comm->cudaDev);
+    pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState); // the thread receives proxyState as its argument
+    ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->cudaDev); // label by CUDA device (not rank count), same value ncclProxyProgress uses for its own name
   }
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
-  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
+  struct ncclProxyProgressState* state = &proxyState->progressState;
 
   // Request the proxy to stop and then wake it
   if (state->opsPool) {
@@ -954,26 +962,26 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool,
   return ncclSuccess;
 }
 
-static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { // Invoke the transport's proxyFree hook for one connection, if the transport defines one
   if (connection->send) {
     if (ncclTransports[connection->transport]->send.proxyFree) {
-      NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm));
+      NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, proxyState)); // send-side hook
     }
   } else {
     if (ncclTransports[connection->transport]->recv.proxyFree) {
-      NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm));
+      NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, proxyState)); // recv-side hook
     }
   }
   return ncclSuccess;
 }
 
-static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclComm* comm) {
+static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclProxyState* proxyState) {
   for (int b=0; b<pool->banks; b++) {
     int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
     for (int i=0; i<max; i++) {
       ncclProxyConnection *connection = pool->pools[b]+i;
       if (connection->state != connUninitialized) {
-        NCCLCHECK(proxyFree(connection, comm));
+        NCCLCHECK(proxyFree(connection, proxyState));
       }
     }
     free(pool->pools[b]);
@@ -984,122 +992,155 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
 
 #include "transport.h"
 
-ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) {
+struct ncclProxyInitReq { // Request payload ncclProxyConnect sends with ncclProxyMsgInit
+  int transport; // transport index given to ncclProxyConnect
+  int send; // send flag given to ncclProxyConnect (non-zero selects the transport's send side there)
+  int tpLocalRank; // caller's comm->topParentLocalRanks[comm->localRank]
+  int tpRank; // caller's comm->topParentRanks[comm->rank]
+  int sameProcess; // 1 when the caller and the proxy rank have equal pidHash, else 0
+};
+
+struct ncclProxyInitResp { // Response payload ncclProxyConnect receives for ncclProxyMsgInit
+  ncclProxyConnection* connection; // copied into proxyConn->connection by ncclProxyConnect
+  char devShmPath[6]; // "XXXXXX" - May or may not be set; 6 raw chars without a NUL, copied over the suffix of "/dev/shm/nccl-XXXXXX"
+};
+
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) { // Connect proxyConn to the proxy of top-parent rank tpProxyRank and perform the Init exchange
   struct ncclSocket* sock;
-  int ready;
-  int type = ncclProxyMsgInit;
+  int ready, proxyRank = -1;
+  struct ncclProxyState* sharedProxyState = comm->proxyState;
 
   // Keep one connection per mlocal rank
+  for (int i = 0; i < comm->localRanks; ++i) { /* find the proxy rank in comm. */
+    if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
+      proxyRank = comm->localRankToRank[i];
+      break;
+    }
+  }
+  if (proxyRank == -1) { WARN("Top parent rank %d is not a local rank of this communicator", tpProxyRank); return ncclInternalError; } // never index peerInfo[-1]
+  proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+  // Keep one connection per local rank
   proxyConn->connection = NULL;
-  proxyConn->rank = rank;
-  if (comm->proxyState.peerSocks == NULL) {
-    NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
-    NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
-    NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
-    for (int i = 0; i < comm->localRanks; ++i) {
-      NCCLCHECK(ncclSocketSetFd(-1, &comm->proxyState.peerSocks[i]));
+  proxyConn->tpRank = tpProxyRank;
+  if (sharedProxyState->peerSocks == NULL) { // first connect on this proxy state: allocate the per-local-rank tables
+    NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
+    NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
+    NCCLCHECK(ncclCalloc(&sharedProxyState->sharedDevMems, comm->sharedRes->tpNLocalRanks));
+    for (int i = 0; i < comm->sharedRes->tpNLocalRanks; ++i) {
+      NCCLCHECK(ncclSocketSetFd(-1, &sharedProxyState->peerSocks[i]));
     }
   }
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
-  sock = comm->proxyState.peerSocks + proxyConn->localRank;
+  proxyConn->tpLocalRank = comm->sharedRes->tpRankToLocalRank[proxyConn->tpRank];
+  sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
   NCCLCHECK(ncclSocketReady(sock, &ready));
   if (!ready) {
-    NCCLCHECK(ncclSocketInit(sock, comm->proxyState.peerAddresses+rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
+    NCCLCHECK(ncclSocketInit(sock, sharedProxyState->peerAddresses+proxyConn->tpRank, comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
     NCCLCHECK(ncclSocketConnect(sock));
   }
-  NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int)));
-  NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
-  NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
-  NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
-  NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*)));
-  struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv;
+
+  struct ncclProxyInitReq req = {0};
+  req.transport = transport;
+  req.send = send;
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
+  req.tpRank = comm->topParentRanks[comm->rank];
+  req.sameProcess = proxyConn->sameProcess;
+
+  struct ncclProxyInitResp resp = {0};
+  // The call below sends proxyConn->connection (still NULL here) to identify the connection.
+  // For Init that value is ignored: the connection pointer is what the response returns.
+  NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgInit, &req, sizeof(req), &resp, sizeof(resp)));
+  proxyConn->connection = resp.connection;
+
   // If we need proxy progress, map progress ops
+  struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv;
   if (tcomm->proxyProgress) {
     char poolPath[] = "/dev/shm/nccl-XXXXXX";
-    NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1));
-    struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
+    strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); // overwrites only the 6 placeholder chars; poolPath keeps its own terminating NUL
+    struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank;
     if (proxyOps->pool == NULL) {
-      NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
+      NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle));
       proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
     }
   }
-  INFO(NCCL_NET, "Connection to proxy localRank %d -> connection %p", proxyConn->localRank, proxyConn->connection);
-  proxyConn->comm = comm;
+  INFO(NCCL_NET|NCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
   return ncclSuccess;
 }
 
+// cuMem API support: blocking request asking the proxy to convert fd; the result is stored in *convertedFd.
+// The response is sent out-of-band using ncclIpcSocket for this specific command
+ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd) {
+  ncclResult_t ret = ncclSuccess;
+  bool ipcSockOpen = false; // true only while ipcSock still has to be closed by the error path
+  struct ncclIpcSocket ipcSock = { 0 };
+  void* opId = malloc(1); // its address is the handle that pairs this request with its response
+  if (opId == NULL) return ncclSystemError;
+  // Create a UDS socket to receive the converted fd
+  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag), ret, error);
+  ipcSockOpen = true;
+  // Request the conversion of the fd over sockets
+  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error);
+
+  // Receive converted fd over UDS
+  NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, error);
+  TRACE(NCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd);
+  ipcSockOpen = false; // the close below is the single close attempt for this socket
+  NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error);
+  // Wait for the in-band completion; respSize is 0, so no payload buffer is needed
+  do { ret = ncclPollProxyResponse(comm, proxyConn, NULL, opId); } while (ret == ncclInProgress);
+  free(opId);
+  return ret;
+
+error:
+  if (ipcSockOpen) (void)ncclIpcSocketClose(&ipcSock); // best effort: ret keeps the first error
+  free(opId); // previously leaked on every failure path
+  WARN("ncclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank);
+  return ret;
+}
+
 const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
-ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
+ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { // Send one request to the proxy and register opId as an expected response; does not wait for the reply
   struct ncclSocket* sock;
   ncclResult_t ret = ncclSuccess;
-  void* respData = NULL;
-  int respDataSize = 0;
-  struct ncclComm* comm = proxyConn->comm;
-  struct ncclIpcSocket ipcSock = { 0 };
+  struct ncclProxyState* sharedProxyState = comm->proxyState;
 
-  if (*comm->abortFlag != 0) {
-    WARN("ncclProxyCallAsync() - Saw abortFlag while waiting for proxyThread response");
-    return ncclInternalError;
-  }
-  if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
+  if (sharedProxyState->peerSocks == NULL) return ncclInternalError; // the socket table is allocated by ncclProxyConnect
 
-  sock = comm->proxyState.peerSocks + proxyConn->localRank;
+  sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; // sockets are indexed by the proxy's tpLocalRank
   if (sock == NULL) return ncclInternalError;
 
-  if (type == ncclProxyMsgConvertFd) {
-    // cuMem API support
-    // Create a UDS socket to receive the converted fd
-    NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, (uint64_t)proxyConn->connection, comm->abortFlag));
-  }
-
   NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
   NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
   NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
   NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
   if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
 
-  if (type == ncclProxyMsgConvertFd) {
-    // cuMem API support
-    int recvFd = -1;
-    if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
-    // Receive converted fd over UDS
-    NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, &recvFd));
-    TRACE(NCCL_NET, "UDS: ConvertFd rank %d returned %p %d", proxyConn->localRank, &recvFd, recvFd);
-    assert(recvFd != -1);
-    respData = &recvFd;
-    respDataSize = sizeof(recvFd);
-    NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  } else {
-    // Send opId to proxy
-    NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error);
-  }
+  // Send opId to proxy
+  NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error); // the pointer value itself is sent, not the byte it points to
+
   // Add proxyOp to expected response queue
-  NCCLCHECK(expectedProxyResponseEnqueue(&comm->proxyState, opId, respSize, respData, respDataSize));
+  NCCLCHECK(expectedProxyResponseEnqueue(sharedProxyState, opId, respSize)); // the entry starts not-done with a respSize-byte buffer
 
   return ncclSuccess;
 error:
-  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  WARN("Proxy Call to rank %d failed (%s)", comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
   return ret;
 }
 
-ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
-  struct ncclComm* comm = proxyConn->comm;
-
+ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
+  struct ncclProxyState* sharedProxyState = comm->proxyState;
   // Receive the connection pointer from the Proxy
   if (*comm->abortFlag) {
     WARN("Comm %p is in abort state", comm);
     return ncclInternalError;
   }
-  if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
+  if (sharedProxyState->peerSocks == NULL) return ncclInternalError;
 
   // Check response queue
   int found = 0;
-  NCCLCHECK(expectedProxyResponseDequeue(&comm->proxyState, opId, respBuff, &found));
+  NCCLCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found));
   if (found == 0) {
     // Attempt to read in a new response header from the proxy thread
-    struct ncclSocket* sock = comm->proxyState.peerSocks + proxyConn->localRank;
+    struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
 
     void* recvOpId;
     int offset = 0;
@@ -1116,7 +1157,7 @@ ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* r
         NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
     }
 
-    INFO(NCCL_PROXY, "ncclPollProxyResponse Recieved new opId=%p", recvOpId);
+    INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", recvOpId);
 
     // Now do a blocking recv of the response size
     int respSize = 0;
@@ -1124,17 +1165,22 @@ ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* r
 
     // If there's a respSize to recv
     if (respSize > 0) {
+      if (recvOpId != opId) {
+        // Unexpected response, need to buffer the socket data
+        respBuff = malloc(respSize);
+      }
+      assert(respBuff != NULL);
       NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
     }
 
     if (recvOpId == opId) {
       INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
-      NCCLCHECK(expectedProxyResponseRemove(&comm->proxyState, recvOpId));
+      NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId));
       return ncclSuccess;
     } else {
-      INFO(NCCL_PROXY, "Queing opId=%p", recvOpId);
+      INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize);
       // Store the result and mark response as completed
-      NCCLCHECK(expectedProxyResponseStore(&comm->proxyState, recvOpId, respBuff, respSize));
+      NCCLCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize));
       return ncclInProgress;
     }
   } else {
@@ -1144,38 +1190,37 @@ ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* r
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
+ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
   // Alloc some memory to act as a handle
+  ncclResult_t res = ncclSuccess;
   void* opId = malloc(1);
 
-  NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId));
-  ncclResult_t res = ncclInProgress;
+  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail);
 
-  while (res == ncclInProgress) {
-    res = ncclPollProxyResponse(proxyConn, respBuff, opId);
-  }
+  do {
+    res = ncclPollProxyResponse(comm, proxyConn, respBuff, opId);
+  } while (res == ncclInProgress);
 
+exit:
   free(opId);
-
   return res;
+fail:
+  goto exit;
 }
 
-static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
-  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) {
+  struct ncclProxyProgressState* state = &proxyState->progressState;
   if (state->opsPool == NULL) {
     int size = sizeof(struct ncclProxyOpsPool);
     struct ncclProxyOpsPool* pool = NULL;
 
-    // The service thread may be launched already but localRanks may not be set yet.
-    while (comm->localRanks == 0) sched_yield();
-
     char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
     shmPath[0] = '\0';
-    NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, comm->localRanks + 1, &state->handle));
+    NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle));
     // Init pool
     pool->nextOps = -1;
 
-    for (int r=0; r<comm->localRanks; r++) {
+    for (int r = 0; r < proxyState->tpLocalnRanks; r++) {
       pool->freeOps[r] = r*MAX_OPS_PER_PEER;
       for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
       pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1;
@@ -1194,20 +1239,20 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
     memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1);
 
     // All ops structures are created, we can start the progress thread
-    NCCLCHECK(ncclProxyProgressCreate(comm));
+    NCCLCHECK(ncclProxyProgressCreate(proxyState));
   }
   return ncclSuccess;
 }
 
-static void proxyOpsFree(struct ncclComm* comm) {
-  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+static void proxyOpsFree(struct ncclProxyState* proxyState) {
+  struct ncclProxyProgressState* state = &proxyState->progressState;
   if (ncclShmClose(state->handle) != ncclSuccess) {
     WARN("[Service thread] shm close failed");
   }
 }
 
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
-  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+  struct ncclProxyProgressState* state = &comm->proxyState->progressState;
   if (state->opsPool == NULL) return ncclSuccess;
 
   if (ncclShmUnlink(state->handle) != ncclSuccess) {
@@ -1216,97 +1261,75 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
-  struct ncclSocket* sock = &peer->sock;
+static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, ncclProxyInitReq* req, ncclProxyInitResp* resp, struct ncclProxyConnection** connection) {
   int id;
-  struct ncclProxyConnection* connection;
   NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
-  NCCLCHECK(ncclProxyGetConnection(connectionPool, id, &connection));
-  connection->sock = sock;
-  NCCLCHECK(ncclSocketRecv(sock, &connection->transport, sizeof(int)));
-  NCCLCHECK(ncclSocketRecv(sock, &connection->send, sizeof(int)));
-  NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
-  connection->localRank = peer->localRank;
-  NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*)));
-  connection->tcomm = connection->send ? &ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv;
+  NCCLCHECK(ncclProxyGetConnection(connectionPool, id, connection));
+
+  (*connection)->sock = &peer->sock;
+  (*connection)->transport = req->transport;
+  (*connection)->send = req->send;
+  (*connection)->tpLocalRank = req->tpLocalRank;
+  (*connection)->sameProcess = req->sameProcess;
+  peer->tpLocalRank = req->tpLocalRank;
+  peer->tpRank = req->tpRank;
+
+  resp->connection = *connection;
+
+  (*connection)->tcomm = (*connection)->send ? &ncclTransports[(*connection)->transport]->send : &ncclTransports[(*connection)->transport]->recv;
   // If we need proxy progress, let's allocate ops and start the thread
-  if (connection->tcomm->proxyProgress) {
-    NCCLCHECK(proxyProgressInit(comm));
-    struct ncclProxyProgressState* state = &comm->proxyState.progressState;
-    NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
+  if ((*connection)->tcomm->proxyProgress) {
+    NCCLCHECK(proxyProgressInit(proxyState));
+    struct ncclProxyProgressState* state = &proxyState->progressState;
+    strncpy(resp->devShmPath, state->opsPoolShmSuffix, sizeof(resp->devShmPath));
   }
-  INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
-  __atomic_store_n(&connection->state, connInitialized, __ATOMIC_RELEASE);
-  return ncclSuccess;
-}
-
-static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
-  struct ncclSocket* sock = &peer->sock;
-  struct ncclProxyConnection* connection;
-  NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(void*)));
-  int reqSize, respSize;
-  NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
-  NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
-  if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError;
-  int nChannels;
-  NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
-
-  // Store opId for completion response
-  void* opId;
-  NCCLCHECK(ncclSocketRecv(sock, &opId, sizeof(opId)));
-  INFO(NCCL_PROXY, "proxyConnSharedInit received opId=%p", opId);
-
-  if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
-  __atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
-
-  // Send the opId for referencing async operation
-  INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(opId=%p)", opId);
-  NCCLCHECK(ncclSocketSend(connection->sock, &opId, sizeof(opId)));
-
-  // Send the response size
-  INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(op.respSize=%d)", respSize);
-  NCCLCHECK(ncclSocketSend(connection->sock, &respSize, sizeof(respSize)));
-
+  INFO(NCCL_NET|NCCL_PROXY, "New proxy %s connection %d from local rank %d, transport %d", (*connection)->send ? "send":"recv", id, (*connection)->tpLocalRank, (*connection)->transport);
+  __atomic_store_n(&(*connection)->state, connInitialized, __ATOMIC_RELEASE);
   return ncclSuccess;
 }
 
 // cuMem API support
-static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, struct ncclComm* comm) {
-  struct ncclSocket* sock = &peer->sock;
-  uint64_t connection;
-  NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(uint64_t)));
-  int reqSize, respSize;
-  NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
-  NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
-  if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
-
-  int fd;
+static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, int fd) {
   struct ncclIpcSocket ipcSock = { 0 };
-  NCCLCHECK(ncclSocketRecv(sock, &fd, sizeof(int)));
+  uint64_t hash = (uint64_t) opId;
 
-  INFO(NCCL_NET, "UDS: proxyConvertFd received fd %d peer %d connection %lx", fd, peer->localRank, connection);
+  INFO(NCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash);
   // Send back the converted fd using UDS
-  NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, connection^1, comm->abortFlag));
-  NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->localRank, connection));
+  NCCLCHECK(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag));
+  NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash));
   NCCLCHECK(ncclIpcSocketClose(&ipcSock));
   return ncclSuccess;
 }
 
-static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount, struct ncclProxyLocalPeer* peer) {
+static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) {
   int done = 1;
   if (op->type == ncclProxyMsgSetup) {
-    INFO(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
-    NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+    TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
+    NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
   } else if (op->type == ncclProxyMsgConnect) {
-    INFO(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
-    NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+    TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
+    NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+  } else if (op->type == ncclProxyMsgSharedInit) {
+    int nChannels = *(int *)op->reqBuff; /* read the whole int payload, same as the ConvertFd branch; a plain *reqBuff yields only the first element */
+    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
+    if (op->connection->tcomm->proxySharedInit) NCCLCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels));
+    __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
+  } else if (op->type == ncclProxyMsgConvertFd) {
+    int fd = *(int *)op->reqBuff;
+    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd);
+    NCCLCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support
+  } else if (op->type == ncclProxyMsgInit) {
+    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
+    NCCLCHECK(proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection));
   } else return ncclInternalError;
+
   if (done) {
+    INFO(NCCL_PROXY, "proxyProgressAsync opId=%p op.type=%d op.reqBuff=%p op.respSize=%d done", op->opId, op->type, op->reqBuff, op->respSize);
     if (op->type == ncclProxyMsgSetup)
       __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
     else if (op->type == ncclProxyMsgConnect)
       __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
-    /* if setup or connect is done, we should not return any error at this point since 
+    /* if setup or connect is done, we should not return any error at this point since
      * ncclSocketSend might already send the respBuff to the requester. If we still choose
      * to abort and close the connection, it can cause segfault if the requester is using
      * the respBuff. */
@@ -1326,14 +1349,14 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
     (*asyncOpCount)--;
     return ncclSuccess;
 
-  } else if (*comm->abortFlag != 0) {
+  } else if (*proxyState->abortFlag != 0) {
     return ncclInternalError;
   }
 
   return ncclInProgress;
 }
 
-static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) {
+static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) {
   struct ncclSocket* sock = &peer->sock;
   struct ncclProxyAsyncOp* asyncOp;
   NCCLCHECK(ncclCalloc(&asyncOp, 1));
@@ -1356,21 +1379,34 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
   asyncProxyOpEnqueue(peer, asyncOp);
 
   (*asyncOpCount)++;
-  NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount, peer));
+  NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
   return ncclSuccess;
 }
 
 #include <poll.h>
 
-void* ncclProxyService(void* _args) {
-  struct ncclComm* comm =  (struct ncclComm *) _args;
-  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
-  if (ncclSetThreadContext(comm) != ncclSuccess) {
-    WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev);
-  } else if (cudaSetDevice(comm->cudaDev) != cudaSuccess) {
-    WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
+static bool proxyMatchOpType(int type) {
+  switch (type) {
+    case ncclProxyMsgInit:
+    case ncclProxyMsgSharedInit:
+    case ncclProxyMsgSetup:
+    case ncclProxyMsgConnect:
+    case ncclProxyMsgConvertFd:
+      return true;
+    default:
+      return false;
   }
-  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+}
+
+void* ncclProxyService(void* _args) {
+  struct ncclProxyState* proxyState =  (struct ncclProxyState*) _args;
+  // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  if (setProxyThreadContext(proxyState)) {
+    INFO(NCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev);
+  } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
+    WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev);
+  }
+  // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
 
   // Prepare poll descriptor
   struct ncclProxyConnectionPool connectionPool;
@@ -1385,7 +1421,7 @@ void* ncclProxyService(void* _args) {
     pollfds[s].fd = -1;
     pollfds[s].events = POLLHUP|POLLIN;
   }
-  if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
+  if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
     WARN("[Proxy Service] Get listenSock fd fails");
     return NULL;
   };
@@ -1399,7 +1435,7 @@ void* ncclProxyService(void* _args) {
     /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
      * connections. Need to wait until all other related comms call abort and safely exit
      * together, or we could face segmentation fault. */
-    if (*comm->abortFlag != 0) stop = 1;
+    if (*proxyState->abortFlag != 0) stop = 1;
     /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
     int ret;
     do {
@@ -1421,7 +1457,7 @@ void* ncclProxyService(void* _args) {
         WARN("[Service thread] Initialize peers[%d].sock fails", s);
         return NULL;
       }
-      if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
+      if (ncclSocketAccept(&peers[s].sock, proxyState->listenSock) != ncclSuccess) {
         WARN("[Service thread] Accept failed %s", strerror(errno));
       } else {
         if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
@@ -1429,7 +1465,7 @@ void* ncclProxyService(void* _args) {
           return NULL;
         }
         npeers++;
-        peers[s].localRank = -1;
+        peers[s].tpLocalRank = -1;
       }
     }
     for (int s=0; s<maxnpeers; s++) {
@@ -1443,10 +1479,11 @@ void* ncclProxyService(void* _args) {
       // Progress all ops for this ncclProxyLocalPeer
       ncclProxyAsyncOp* op = peer->asyncOps;
       while (op != nullptr) {
+        ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
         type = op->type;
-        res = proxyProgressAsync(op, comm, &asyncOpCount, peer);
+        res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
         if (res == ncclSuccess || res == ncclInProgress) {
-          op = op->next;
+          op = opnext;
         } else {
           // Res is a bad result
           closeConn = 1;
@@ -1460,10 +1497,10 @@ void* ncclProxyService(void* _args) {
         int closed;
         res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
         if (res != ncclSuccess && res != ncclInProgress) {
-          WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->localRank, res, closed);
+          WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
           closeConn = 1;
         } else if (closed) {
-          INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
+          INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
           closeConn = 1;
         } else if (res == ncclSuccess) { // We received something from the sock
           if (type == ncclProxyMsgStop) {
@@ -1471,17 +1508,10 @@ void* ncclProxyService(void* _args) {
             closeConn = 1;
           } else if (type == ncclProxyMsgClose) {
             closeConn = 1;
-          } else if (type == ncclProxyMsgInit) {
-            res = proxyConnInit(peers+s, &connectionPool, comm);
-          } else if (type == ncclProxyMsgSharedInit) {
-            res = proxyConnSharedInit(peers+s, &connectionPool, comm);
-          } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
-            INFO(NCCL_PROXY, "proxyConnSetupConnect for peer->localRank %d,", peer->localRank);
-            res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
-          } else if (type == ncclProxyMsgConvertFd) {
-            res = proxyConvertFd(peers+s, comm); // cuMem API support
+          } else if (proxyMatchOpType(type)) {
+            res = proxyServiceInitOp(type, peers+s, &connectionPool, proxyState, &asyncOpCount);
           } else {
-            WARN("[Service thread] Unknown command %d from localRank %d", type, peer->localRank);
+            WARN("[Service thread] Unknown command %d from localRank %d", type, peer->tpLocalRank);
             closeConn = 1;
           }
 
@@ -1491,7 +1521,7 @@ void* ncclProxyService(void* _args) {
         closeConn = 1;
       }
       if (res != ncclSuccess && res != ncclInProgress) {
-        WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res);
+        WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
         closeConn = 1;
       }
 
@@ -1509,67 +1539,106 @@ void* ncclProxyService(void* _args) {
   }
 
   // Wait for all operations to complete and stop progress thread before freeing any resource
-  if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
+  if (ncclProxyProgressDestroy(proxyState) != ncclSuccess) {
     WARN("[Proxy Service] proxyDestroy failed");
   }
   for (int s=0; s<maxnpeers; s++) {
     ncclSocketClose(&peers[s].sock);
   }
-  ncclProxyFreeConnections(&connectionPool, comm);
-  ncclSocketClose(comm->proxyState.listenSock);
-  proxyOpsFree(comm);
+  ncclProxyFreeConnections(&connectionPool, proxyState);
+  ncclSocketClose(proxyState->listenSock);
+  free(proxyState->listenSock);
+  proxyOpsFree(proxyState);
   return NULL;
 }
 
 ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
-  comm->proxyState.listenSock = sock;
-  comm->proxyState.peerAddresses = peerAddresses;
+  assert(comm->sharedRes->proxyState == NULL);
+  NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1));
+  comm->proxyState = comm->sharedRes->proxyState;
+  comm->proxyState->refCount = 1;
+  comm->proxyState->listenSock = sock;
+  comm->proxyState->peerAddresses = peerAddresses;
   return ncclSuccess;
 }
 
 ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
-  // comm->proxyState.thread is pthread_join()'d by commFree() in init.cc
-  pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm);
-  ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
+  /* proxyState is shared among parent comm and split comms. comm->proxyState->thread is
+   * pthread_join()'d by commFree() in init.cc when the refCount reduces down to 0. */
+  struct ncclProxyState* proxyState = comm->proxyState;
+  if (proxyState->refCount == 1) {
+    /* we have to make sure all following fields in comm have been initialized. */
+    proxyState->tpRank = comm->rank;
+    proxyState->tpnRanks = comm->nRanks;
+    proxyState->tpLocalnRanks = comm->localRanks;
+    proxyState->cudaDev = comm->cudaDev;
+    proxyState->abortFlag = comm->abortFlag;
+    proxyState->p2pnChannels = comm->p2pnChannels;
+    proxyState->p2pChunkSize = comm->p2pChunkSize;
+    proxyState->nChannels = comm->nChannels;
+    proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers;
+    proxyState->dmaBufSupport = comm->dmaBufSupport;
+    proxyState->ncclNet = comm->ncclNet;
+    proxyState->ncclCollNet = comm->ncclCollNet;
+    memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
+
+    pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
+    ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProxyStop(struct ncclComm* comm) {
+  if (comm->sharedRes && comm->sharedRes->proxyState) {
+    struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
+
+    if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
+      if (sharedProxyState->peerAddresses) {
+        if (*comm->abortFlag == 0) {
+          struct ncclSocket sock;
+          int type = ncclProxyMsgStop;
+          NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
+          NCCLCHECK(ncclSocketConnect(&sock));
+          NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
+          NCCLCHECK(ncclSocketClose(&sock));
+        }
+      }
+
+      if (sharedProxyState->peerSocks) {
+        int tplocalRanks = comm->sharedRes->tpNLocalRanks;
+        for (int i = 0; i < tplocalRanks; i++) {
+          int fd;
+          NCCLCHECK(ncclSocketGetFd(sharedProxyState->peerSocks + i, &fd));
+          if (fd >= 0) {
+            if (sharedProxyState->proxyOps[i].pool) {
+              NCCLCHECK(ncclShmClose(sharedProxyState->proxyOps[i].handle));
+            }
+            if (sharedProxyState->sharedDevMems[i]) {
+              if (!ncclCuMemEnable()) {
+                CUDACHECK(cudaIpcCloseMemHandle(sharedProxyState->sharedDevMems[i]));
+              }
+            }
+            int type = ncclProxyMsgClose;
+            if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
+            NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
+          }
+        }
+      }
+    }
+  }
+
   return ncclSuccess;
 }
 
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
-  struct ncclProxyState* state = &comm->proxyState;
+  struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
 
-  if (state == NULL) return ncclSuccess;
-  if (state->peerAddresses) {
-    if (*comm->abortFlag == 0) {
-      struct ncclSocket sock;
-      int type = ncclProxyMsgStop;
-      NCCLCHECK(ncclSocketInit(&sock, comm->proxyState.peerAddresses + comm->rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
-      NCCLCHECK(ncclSocketConnect(&sock));
-      NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
-      NCCLCHECK(ncclSocketClose(&sock));
-    }
-    free(state->peerAddresses);
-  }
-
-  if (state->peerSocks) {
-    for (int i=0; i<comm->localRanks; i++) {
-      int fd;
-      NCCLCHECK(ncclSocketGetFd(state->peerSocks + i, &fd));
-      if (fd >= 0) {
-        if (state->proxyOps[i].pool) {
-          NCCLCHECK(ncclShmClose(state->proxyOps[i].handle));
-        }
-        if (state->sharedDevMems[i]) {
-          CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i]));
-        }
-        int type = ncclProxyMsgClose;
-        if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks + i, &type, sizeof(int)));
-        NCCLCHECK(ncclSocketClose(state->peerSocks + i));
-      }
-    }
-    free(state->peerSocks);
-    free(state->proxyOps);
-    free(state->sharedDevMems);
-    expectedProxyResponseFree(state);
-  }
+  assert(sharedProxyState->refCount == 0);
+  free(sharedProxyState->peerAddresses);
+  free(sharedProxyState->peerSocks);
+  free(sharedProxyState->proxyOps);
+  free(sharedProxyState->sharedDevMems);
+  expectedProxyResponseFree(sharedProxyState);
+  free(sharedProxyState);
   return ncclSuccess;
 }
diff --git a/src/transport.cc b/src/transport.cc
index f1c30fa01b..896082059b 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -22,8 +22,8 @@ template <int type>
 static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) {
   struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
   struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
-  struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
-                                                  comm->channels[channelId].peers[peer].recv + connIndex;
+  struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer]->send + connIndex :
+                                                  comm->channels[channelId].peers[peer]->recv + connIndex;
   // handle intra-node network connections
   int n1 = -1, n2 = -1;
   if (connIndex == NCCL_CONN_IDX_P2P_NET) {
@@ -57,12 +57,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int n
   uint64_t mask = 1UL << channel->id;
   for (int i=0; i<nrecv; i++) {
     int peer = peerRecv[i];
-    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue;
     comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
   }
   for (int i=0; i<nsend; i++) {
     int peer = peerSend[i];
-    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue;
     comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
   }
   return ncclSuccess;
@@ -85,7 +85,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
   struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
   struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel
 
-  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
   // First time initialization
   for (int i=1; i<comm->nRanks; i++) {
     int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
@@ -154,13 +154,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
       for (int c=0; c<MAXCHANNELS; c++) {
           TIME_START(3);
           if (sendMask & (1UL<<c)) {
-            struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
+            struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
             // This connector hasn't completed connection yet
             if (conn->connected == 0) {
               NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
               if (ret == ncclSuccess) {
+                struct ncclDevChannelPeer* addr;
                 conn->connected = 1;
-                CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
+                CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); /* devPeers[sendPeer] is a device memory access, so fetch the peer pointer to the host */
+                CUDACHECKGOTO(cudaStreamSynchronize(comm->sharedRes->hostStream.cudaStream), ret, fail); /* addr is read on the host in the next call; make the ordering explicit instead of relying on implicit blocking */
+                CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
               } else if (ret == ncclInProgress) {
                 allChannelsConnected = false;
               }
@@ -171,13 +174,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
           // Start with recv channels
           TIME_START(4);
           if (recvMask & (1UL<<c)) {
-            struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
+            struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
             // This connector hasn't completed connection yet
             if (conn->connected == 0) {
               NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
               if (ret == ncclSuccess) {
+                struct ncclDevChannelPeer* addr;
                 conn->connected = 1;
-                CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
+                CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); /* devPeers[recvPeer] is a device memory access, so fetch the peer pointer to the host */
+                CUDACHECKGOTO(cudaStreamSynchronize(comm->sharedRes->hostStream.cudaStream), ret, fail); /* addr is read on the host in the next call; make the ordering explicit instead of relying on implicit blocking */
+                CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
               } else if (ret == ncclInProgress) {
                 allChannelsConnected = false;
               }
@@ -203,8 +209,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
   if (highestTransportType != NULL) *highestTransportType = highestType;
   TIME_PRINT("P2P Setup/Connect");
 exit:
-  NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
+  NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
   return ret;
 fail:
   goto exit;
@@ -238,7 +244,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
   }
 
   // select
-  struct ncclChannelPeer* root = channel->peers+nranks;
+  struct ncclChannelPeer* root = channel->peers[nranks];
   // connector index: 0 for recv, 1 for send
   struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
   struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
@@ -277,8 +283,9 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
   // connect
   if (isMaster) {
     NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
-    struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
-    struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
+    struct ncclDevChannelPeer* devRoot;
+    CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
+    struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
     CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
   }
   // recv side sends connect info to send side
@@ -317,16 +324,20 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
   // Free collNet resources
   for (int r=0; r<comm->nChannels; r++) {
     struct ncclChannel* channel = comm->channels+r;
-    struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      struct ncclConnector* send = peer->send + b;
-      if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
-      send->transportResources = NULL; // avoid double free
-    }
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      struct ncclConnector* recv = peer->recv + b;
-      if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
-      recv->transportResources = NULL; // avoid double free
+    struct ncclChannelPeer* peer = channel->peers[comm->nRanks];
+    if (peer) {
+      if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
+        for (int b=0; b<NCCL_MAX_CONNS; b++) {
+          struct ncclConnector* send = peer->send + b;
+          if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
+          send->transportResources = NULL; // avoid double free
+        }
+        for (int b=0; b<NCCL_MAX_CONNS; b++) {
+          struct ncclConnector* recv = peer->recv + b;
+          if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
+          recv->transportResources = NULL; // avoid double free
+        }
+      }
     }
   }
   return ncclSuccess;
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 7a5e012e02..a3eb18f3a3 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -144,24 +144,26 @@ struct setupReq {
   int netDev;
   int useGdr;
   int needFlush;
+  struct ncclCollNetSharedRes* collNet;
 };
 
 
 /* Setup send connector, and return connect information for others in the coll
  * communicator to connect to me */
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
-  struct setupReq req;
+  struct setupReq req = { 0 };
 
-  int proxyRank;
+  int proxyRank, tpProxyRank;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
-  // Determine whether we need to flush the GDR buffer on recv or not
-  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
-  NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
+  tpProxyRank = comm->topParentRanks[myInfo->rank];
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
+  ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
+  req.collNet = comm->collNetSharedRes;
+  NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
 
   INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
       req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
@@ -169,17 +171,22 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 }
 
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
-  struct setupReq req;
+  struct setupReq req = { 0 };
 
-  int proxyRank;
+  int proxyRank, tpProxyRank;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
   recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  // Determine whether we need to flush the GDR buffer on recv or not
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
+  tpProxyRank = comm->topParentRanks[myInfo->rank];
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
   struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
-  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
+  ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
+  req.collNet = comm->collNetSharedRes;
+  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
 
   INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
       req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
@@ -224,7 +231,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
   // We're on the same process as the proxy. We can pass a pointer to a struct.
   struct collNetConnectArgs args = { rank, nranks, connectInfos };
   struct connectMap* map;
-  NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+  NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
 
   // If collnet connect failed, propagate error to fallback on regular p2p
   if (map == NULL) return ncclSystemError;
@@ -250,7 +257,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
   // We're on the same process as the proxy. We can pass a pointer to a struct.
   struct collNetConnectArgs args = { rank, nranks, connectInfos };
   struct connectMap* map;
-  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
 
   // If collnet connect failed, propagate error to fallback on regular p2p
   if (map == NULL) return ncclSystemError;
@@ -279,7 +286,7 @@ static ncclResult_t recvFree(struct ncclConnector* recv) {
   return ncclSuccess;
 }
 
-static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct setupReq* req = (struct setupReq*)reqBuff;
   if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
 
@@ -291,9 +298,10 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   resources->netDev = req->netDev;
   resources->useGdr = req->useGdr;
   ncclNetProperties_t props;
-  NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
+  NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
+  connection->collNet = req->collNet;
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
   return ncclSuccess;
 }
 
@@ -303,19 +311,19 @@ struct sharedResources {
   int commRefCount[NCCL_MAX_NETDEVS];
 };
 
-ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
-  struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, struct ncclCollNetSharedRes* collNet, void* collNetHandle) { // Allocates collNet->resources on first use, then opens a listen comm for netDev unless a coll comm already exists for that device.
+  struct sharedResources* resources = (struct sharedResources*)collNet->resources;
   if (resources == NULL) {
     NCCLCHECK(ncclCalloc(&resources, 1));
-    comm->proxyState.progressState.collNet.resources = resources;
+    collNet->resources = resources; // store the new allocation so subsequent calls on this collNet reuse it
   }
   if (resources->collNetComms[netDev] == NULL)
-    NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev));
+    NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev)); // listen comm is kept per device in collNetListenComms[netDev]
   return ncclSuccess;
 }
 
-static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
-  struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+static ncclResult_t sharedConnect(struct ncclProxyState* proxyState, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclCollNetSharedRes* collNet, void** collNetComm) {
+  struct sharedResources* resources = (struct sharedResources*)collNet->resources;
   if (resources->collNetComms[netDev] == NULL) {
     // Connect to coll comm
     collNetHandle_t** handlePtrs = NULL;
@@ -324,13 +332,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
       struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
       handlePtrs[i] = &(info->collNetHandle);
     }
-    ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank,
+    ncclResult_t ret = proxyState->ncclCollNet->connect((void**)handlePtrs, nranks, rank,
           resources->collNetListenComms[netDev],
           resources->collNetComms+netDev);
     free(handlePtrs);
     if (ret == ncclSuccess) {
       // Close listen comm
-      NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev]));
+      NCCLCHECK(proxyState->ncclCollNet->closeListen(resources->collNetListenComms[netDev]));
     } else {
       resources->collNetListenComms[netDev] = NULL;
     }
@@ -340,55 +348,53 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
   return ncclSuccess;
 }
 
-static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
-  struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+static ncclResult_t sharedFree(struct ncclProxyState* proxyState, struct ncclCollNetSharedRes* collNet, int netDev) { // Decrements netDev's comm ref count, closes its coll comm at zero, and frees collNet->resources once every device's count is zero.
+  struct sharedResources* resources = (struct sharedResources*)collNet->resources; // NOTE(review): dereferenced below without a NULL check — confirm callers only reach here after sharedListen
   resources->commRefCount[netDev]--;
   if (resources->commRefCount[netDev] == 0) {
-    NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev]));
+    NCCLCHECK(proxyState->ncclCollNet->closeColl(resources->collNetComms[netDev])); // count for this device reached zero: close its coll comm
   }
   for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
-  comm->proxyState.progressState.collNet.resources = NULL;
+  collNet->resources = NULL; // cleared before the free() below so collNet holds no stale pointer
   free(resources);
   return ncclSuccess;
 }
 
-static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
-  struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
-  if (state->size == 0) {
-    state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
+static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int cuda, char** gpuPtr, char** cpuPtr, int* size) { // Sizes the shared buffer on first use, allocates the device (cuda != 0) or host (cuda == 0) buffer if absent, and returns pointer and size via the out-parameters.
+  if (collNet->size == 0) {
+    collNet->size = 2 * collNet->nChannels * collNet->buffSize; // factor 2 matches the two 'type' pools indexed in sharedBuffersGet (callers pass 0 for send, 1 for recv)
   }
 
-  *size = state->size;
+  *size = collNet->size; // reported even when the buffer already existed
 
-  if (cuda && state->cudaBuff == NULL) {
-    NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, comm->sideStream, cuda));
+  if (cuda && collNet->cudaBuff == NULL) {
+    NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda)); // NOTE(review): meaning of the nullptr and 'cuda' arguments depends on ncclCudaCalloc's signature — confirm
   }
-  if (!cuda && state->hostBuff == NULL) {
-    NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
+  if (!cuda && collNet->hostBuff == NULL) {
+    NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
   }
-  *gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
+  *gpuPtr = *cpuPtr = cuda ? collNet->cudaBuff : collNet->hostBuff; // both out-pointers receive the same address
   return ncclSuccess;
 }
 
-static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) {
+static ncclResult_t sharedBuffersGet(struct ncclCollNetSharedRes* collNet, int type, int slot, int channel, int* offset) { // Writes to *offset the byte offset of the (type, slot, channel) slot inside the shared buffer; always returns ncclSuccess.
   // Use different pools for different channels and also separate send/recv.
-  int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
-  int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
+  int slotSize = collNet->buffSize / NCCL_STEPS; // one buffer is divided evenly into NCCL_STEPS slots
+  int globalSlot = (type * NCCL_STEPS + slot) * collNet->nChannels + channel; // ordering: type, then step slot, then channel
   *offset = slotSize * globalSlot;
   return ncclSuccess;
 }
 
-static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) {
-  struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
-  if (state->size == 0) return ncclSuccess;
-  CUDACHECK(cudaFree(state->cudaBuff));
-  NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+static ncclResult_t sharedBuffersDestroy(struct ncclCollNetSharedRes* collNet) { // Frees the shared device and host buffers; repeated calls return early because size is reset to 0 at the end.
+  if (collNet->size == 0) return ncclSuccess; // size is zeroed below, so this also catches "already destroyed"
+  NCCLCHECK(ncclCudaFree(collNet->cudaBuff));
+  NCCLCHECK(ncclCudaHostFree(collNet->hostBuff)); // NOTE(review): cudaBuff/hostBuff are not reset to NULL here, while sharedBuffersInit only allocates when they are NULL — confirm no re-init can follow a destroy
   // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once.
-  state->size = 0;
+  collNet->size = 0;
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct setupReq* req = (struct setupReq*)reqBuff;
   if (reqSize != sizeof (struct setupReq)) return ncclInternalError;
 
@@ -401,18 +407,19 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   resources->useGdr = req->useGdr;
   resources->needFlush = req->needFlush;
   ncclNetProperties_t props;
-  NCCLCHECK(collNetGetProperties(comm, req->netDev, &props));
+  NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
+  connection->collNet = req->collNet;
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
 
   collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
   if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
 
-  NCCLCHECK(sharedListen(comm, req->netDev, netHandle));
+  NCCLCHECK(sharedListen(proxyState, req->netDev, req->collNet, netHandle));
   return ncclSuccess;
 }
 
-static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
   struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
   struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
@@ -426,7 +433,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
     resources->recvMhandles[p] = info->mhandles[p];
 
-  NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+  NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm));
 
   // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
   if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
@@ -434,7 +441,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
     *((struct connectMap**)respBuff) = NULL;
     return ncclSuccess;
   }
-  connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev;
+  connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev;
 
   struct connectMap* map = &resources->map;
 
@@ -445,7 +452,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
   if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
     uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr));
 
     resources->gdcSync = cpuPtr;
     struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -462,7 +469,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   // Allocate & Register shared buffers for the Simple protocol
   int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
   struct connectMapMem* mapMem = map->mems+bank;
-  NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+  NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
   NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
 
 #if CUDA_VERSION >= 11070
@@ -470,23 +477,23 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   if (resources->useGdr && resources->useDmaBuf) {
     int dmabuf_fd;
     CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
-    NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
-                                 NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
-                                 &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+    NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                                                  NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
+                                                  &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
     (void)close(dmabuf_fd);
   } else // FALL-THROUGH to nv_peermem GDR path
 #endif
   {
-    NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
-                           resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
-                           &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+    NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                                            resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
+                                            &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
   }
 
   *((struct connectMap**)respBuff) = &resources->map;
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
   struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
 
@@ -494,7 +501,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
   resources->collNetRank = args->rank;
 
-  NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+  NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm));
 
   // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
   if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
@@ -502,7 +509,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
     *((struct connectMap**)respBuff) = NULL;
     return ncclSuccess;
   }
-  connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1;
+  connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev + 1;
 
   struct connectMap* map = &resources->map;
 
@@ -513,7 +520,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
   if (ncclGdrCopy) {
     uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr));
 
     if (ncclParamGdrCopySyncEnable()) {
       resources->gdcSync = cpuPtr;
@@ -531,7 +538,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   // Allocate & Register shared buffers for the Simple protocol
   int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
   struct connectMapMem* mapMem = map->mems+bank;
-  NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+  NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
   NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
 
 #if CUDA_VERSION >= 11070
@@ -539,16 +546,16 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   if (resources->useGdr && resources->useDmaBuf) {
     int dmabuf_fd;
     CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
-    NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
-                                 NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
-                                 &resources->mhandles[NCCL_PROTO_SIMPLE]));
+    NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                                                  NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
+                                                  &resources->mhandles[NCCL_PROTO_SIMPLE]));
     (void)close(dmabuf_fd);
   } else // FALL-THROUGH to nv_peermem GDR path
 #endif
   {
-    NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
-                           resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
-                           &resources->mhandles[NCCL_PROTO_SIMPLE]));
+    NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                                            resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
+                                            &resources->mhandles[NCCL_PROTO_SIMPLE]));
   }
 
   // Pass info to send side
@@ -561,41 +568,43 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   return ncclSuccess;
 }
 
-static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { // Tears down a send connection: deregisters memory handles, frees its mapped memory, and drops its use of the shared collNet state. Does nothing when transportResources is NULL.
   struct sendResources* resources = (struct sendResources*)(connection->transportResources);
 
   if (resources) {
     for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
       if (resources->sendMhandles[p]) {
-        NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
+        NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->sendMhandles[p])); // only non-NULL handles are deregistered
       }
     }
     struct connectMapMem* mems = resources->map.mems;
     NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
-    CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
     if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
-    NCCLCHECK(sharedBuffersDestroy(comm));
-    NCCLCHECK(sharedFree(comm, resources->netDev));
+    NCCLCHECK(sharedBuffersDestroy(connection->collNet)); // must run before the ref-count drop below, which may free collNet
+    NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev));
+    if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet); // the struct itself is freed when the decremented count reaches zero
     free(connection->transportResources);
   }
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
   struct recvResources* resources = (struct recvResources*)(connection->transportResources);
 
   if (resources) {
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
       if (resources->mhandles[p]) {
-        NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->mhandles[p]));
       }
     }
     struct connectMapMem* mems = resources->map.mems;
     NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
-    CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
     if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
-    NCCLCHECK(sharedBuffersDestroy(comm));
-    NCCLCHECK(sharedFree(comm, resources->netDev));
+    NCCLCHECK(sharedBuffersDestroy(connection->collNet));
+    NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev));
+    if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet);
     free(connection->transportResources);
   }
   return ncclSuccess;
@@ -605,7 +614,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
 #define LAST_OF_GROUP(s) \
   (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
 
-static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
@@ -633,7 +642,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
         int sharedBuffSlot = sub->posted%NCCL_STEPS;
         int offset;
-        NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+        NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
         resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
         __sync_synchronize();
         volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@@ -654,7 +663,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
           int ready = 1;
           if (s == 0) {
             int offset;
-            NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+            NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
             args->sharedBuff[sharedBuffSlot] = localBuff + offset;
             args->sharedSize[sharedBuffSlot] = args->chunkSize;
           }
@@ -680,7 +689,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
           int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
           reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
           char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
-          NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
+          NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
           if (sub->requests[buffSlot] == NULL) continue;
 
           TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
@@ -696,7 +705,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         int done, size;
         int group = s / COLLNET_GROUP_NSUBS;
         int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
-        NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size));
+        NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
         if (done) {
           TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
           // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
@@ -720,7 +729,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
@@ -751,7 +760,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         int sharedBuffSlot = sub->posted%NCCL_STEPS;
         int startChannel = group*COLLNET_GROUP_NSUBS;
         int offset;
-        NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+        NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
         reqFifo[group][buffSlot].recvBuff = localBuff + offset;
         TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
         sub->posted += args->sliceSteps;
@@ -782,8 +791,8 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
             } else {
               int startChannel = group*COLLNET_GROUP_NSUBS;
               int offset;
-              NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
-              NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
+              NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
+              NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
             }
           } else {
             for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@@ -797,7 +806,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         int group = s / COLLNET_GROUP_NSUBS;
         int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
         int done = 1;
-        if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL));
+        if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL));
         if (done) {
           TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
           for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@@ -811,7 +820,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
         int startChannel = group*COLLNET_GROUP_NSUBS;
         int offset;
-        NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+        NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
         volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
         offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize;
         __sync_synchronize();
diff --git a/src/transport/net.cc b/src/transport/net.cc
index a8fafcc10f..748be8ca42 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -13,6 +13,7 @@
 #include "collectives.h"
 #include "gdrwrap.h"
 #include "shm.h"
+#include "p2p.h"
 #include "profiler.h"
 #include "graph.h"
 #include "graph/topo.h"
@@ -67,10 +68,8 @@ struct connectMapMem{
   char* gpuPtr;
   char* cpuPtr;
   int size;
-  union {
-    char shmPath[PATH_MAX];
-    cudaIpcMemHandle_t ipc;
-  };
+  ncclIpcDesc ipcDesc;
+  char shmPath[PATH_MAX];
   ncclShmHandle_t attachHandle;
   ncclShmHandle_t createHandle;
 };
@@ -95,9 +94,9 @@ struct sendResources {
   struct ncclSendMem* sendMem;
   struct ncclRecvMem* recvMem;
 
-  int rank;
-  int localRank;
-  int remoteRank;
+  int tpRank;
+  int tpLocalRank;
+  int tpRemoteRank;
   int netDev;
   int useGdr;
   int useDmaBuf;
@@ -122,10 +121,10 @@ struct recvResources {
   struct ncclSendMem* sendMem;
   struct ncclRecvMem* recvMem;
 
-  int rank;
-  int localRank;
-  int remoteRank;
-  int proxyRank;
+  int tpRank;
+  int tpLocalRank;
+  int tpRemoteRank;
+  int tpRemoteProxyRank;
   int netDev;
   int useGdr;
   int useDmaBuf;
@@ -162,9 +161,9 @@ NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
 NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
 
 struct setupReq {
-  int rank;
-  int localRank;
-  int remoteRank;
+  int tpRank;
+  int tpLocalRank;
+  int tpRemoteRank;
   int shared;
   int netDev;
   int useGdr;
@@ -177,7 +176,8 @@ struct setupReq {
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
-  struct setupReq req;
+  struct setupReq req = { 0 };
+  int localRank, tpProxyRank;
 
   send->conn.shared = req.shared = (graph || mscclAvailable() && mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -195,20 +195,22 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
     send->conn.curr_hdp_reg = req.curr_hdp_reg;
   }
 
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
-  req.rank = myInfo->rank;
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
-  req.remoteRank = peerInfo->rank;
-  NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+  tpProxyRank = comm->topParentRanks[proxyRank];
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
+  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpRank = comm->topParentRanks[myInfo->rank];
+  req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
+  NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
 
   if (proxyRank == myInfo->rank) {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
         req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
   } else {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
         proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
   }
-  *((int*)connectInfo) = proxyRank;
+  *((int*)connectInfo) = tpProxyRank;
   return ncclSuccess;
 }
 
@@ -219,7 +221,8 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
 
 /* Setup recv connector */
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
-  struct setupReq req;
+  struct setupReq req = { 0 };
+  int localRank;
 
   recv->conn.shared = req.shared = (graph || mscclAvailable() && mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -227,7 +230,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   req.netDev = -1;
 
   // Use myInfo->rank as the receiver uses its own NIC
-  int proxyRank = myInfo->rank;
+  int proxyRank = myInfo->rank, tpProxyRank;
   if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &req.netDev));
   if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
@@ -236,13 +239,15 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
   // We don't support PXN on receive yet
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
+  tpProxyRank = comm->topParentRanks[myInfo->rank];
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
 
-  req.rank = myInfo->rank;
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
-  req.remoteRank = peerInfo->rank;
-  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
+  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
+  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpRank = comm->topParentRanks[myInfo->rank];
+  req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
+  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, comm->ncclNet->name, req.netDev,
       req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
   return ncclSuccess;
 }
@@ -297,39 +302,47 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
     send->transportResources = map;
     opId = send;
     INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
-    NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
+    NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
   } else {
     opId =  send;
   }
 
   ncclResult_t ret;
-  NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId));
+  NCCLCHECK(ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId));
   if (ret == ncclInProgress) {
     return ret;
   }
   INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId);
 
-  if (map->sameProcess) {
+  if (map->sameProcess && !ncclCuMemEnable()) {
     if (map->cudaDev != comm->cudaDev) {
-      // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
-        return ncclInternalError;
+      if (!ncclCuMemEnable()) {
+        // Enable P2P access for Legacy IPC
+        cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
+        if (err == cudaErrorPeerAccessAlreadyEnabled) {
+          cudaGetLastError();
+        } else if (err != cudaSuccess) {
+          WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
+          return ncclInternalError;
+        }
       }
     }
-  } else {
-    NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
+  } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
+    if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
     if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
-      CUDACHECK(cudaIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
+      NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
+                                             map->mems[NCCL_NET_MAP_DEVMEM].size,
+                                             &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
+                                             (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
       map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL;
     }
     if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) {
-      void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank;
+      void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank;
       if (*sharedDevMemPtr == NULL) {
-        CUDACHECK(cudaIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
+        NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
+                                               map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
+                                               &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
+                                               sharedDevMemPtr));
       }
       map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr);
       map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL;
@@ -363,13 +376,13 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
     opId = recv;
     INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
        opId, &recv->proxyConn, connectInfo);
-    NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
+    NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
   } else {
     opId = recv;
   }
 
   ncclResult_t ret;
-  NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId));
+  NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId));
   if (ret == ncclInProgress) {
     return ret;
   }
@@ -394,10 +407,24 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
 static ncclResult_t sendFree(struct ncclConnector* send) {
   struct connectMap* map = (struct connectMap*)(send->transportResources);
   if (map) {
-    if (map->sameProcess == 0) {
-      NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
+    int cudaDev;
+    CUDACHECK(cudaGetDevice(&cudaDev));
+    if (map->sameProcess && map->cudaDev == cudaDev) {
+      // Our own GPU, so it wasn't mapped in
+      free(map);
+      return ncclSuccess;
+    }
+    if (!map->sameProcess || ncclCuMemEnable()) {
+      if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
       if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
-        CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+        if (ncclCuMemEnable()) {
+          // cuMem API support
+          NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
+          NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+        } else {
+          // Legacy CUDA IPC support
+          CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+        }
       }
     }
     free(map);
@@ -412,86 +439,87 @@ static ncclResult_t recvFree(struct ncclConnector* recv) {
 }
 
 #define NCCL_SHARED_STEPS 16
-static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess,
-    int nChannels, char** gpuPtr, char** cpuPtr, int* size, cudaIpcMemHandle_t* ipc) {
+static ncclResult_t sharedBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess,
+    int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) {
   if (cuda == 0 && sameProcess == 0) {
       WARN("PXN should not use host buffers for data");
       return ncclInternalError;
   }
-  struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+  struct ncclProxyProgressState* progressState = &proxyState->progressState;
   if (progressState->localPeers == NULL) {
-    NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+    NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
   }
   struct ncclProxyPeer** localPeers = progressState->localPeers;
-  if (localPeers[localRank] == NULL) {
-    NCCLCHECK(ncclCalloc(localPeers+localRank, 1));
+  if (localPeers[tpLocalRank] == NULL) {
+    NCCLCHECK(ncclCalloc(localPeers + tpLocalRank, 1));
   }
-  struct ncclProxyPeer* peer = localPeers[localRank];
+  struct ncclProxyPeer* peer = localPeers[tpLocalRank];
   struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
   state->refcount++;
   if (state->size == 0) {
-    state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pChunkSize;
+    state->size = nChannels * NCCL_SHARED_STEPS * proxyState->p2pChunkSize;
   }
 
   if (size) *size = state->size;
 
   if (cuda && state->cudaBuff == NULL) {
-    NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, comm->sideStream, cuda));
-    if (sameProcess == 0) {
-      CUDACHECK(cudaIpcGetMemHandle(&state->ipc, state->cudaBuff));
+    if (sameProcess == 0 || ncclCuMemEnable()) {
+      NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
+    } else {
+      NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, cuda));
     }
   }
   if (!cuda && state->hostBuff == NULL) {
     NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
   }
   if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
-  if (sameProcess) {
-    if (gpuPtr) *gpuPtr = *cpuPtr;
-  } else {
-    if (gpuPtr) *gpuPtr = NULL;
-    if (ipc) memcpy(ipc, &state->ipc, sizeof(cudaIpcMemHandle_t));
-  }
+  if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL;
+  if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc));
   return ncclSuccess;
 }
 
-static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
+static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) {
   // Use different pools for different channels and also separate send/recv.
   int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
-  *offset = comm->p2pChunkSize * globalSlot;
+  *offset = proxyState->p2pChunkSize * globalSlot;
   return ncclSuccess;
 }
 
-static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) {
-  if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
-  struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank];
+static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
+  if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
+  struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
   if (peer == NULL) NCCLCHECK(ncclInternalError;)
   struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
   if (state->size == 0) NCCLCHECK(ncclInternalError);
-  state->refcount--;
-  if (state->refcount == 0) {
-    if (state->cudaBuff) CUDACHECK(cudaFree(state->cudaBuff));
+  if (ncclAtomicRefCountDecrement(&state->refcount) == 0) {
+    if (state->cudaBuff) {
+      if (!connection->sameProcess || ncclCuMemEnable()) {
+        NCCLCHECK(ncclP2pFreeShareableBuffer(&state->ipcDesc));
+      }
+      NCCLCHECK(ncclCudaFree(state->cudaBuff));
+    }
     if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff));
   }
+
   if (peer->send.refcount || peer->recv.refcount) return ncclSuccess;
+
   free(peer);
-  comm->proxyState.progressState.localPeers[localRank] = NULL;
-  for (int r=0; r<comm->localRanks; r++) {
-    if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess;
+  proxyState->progressState.localPeers[tpLocalRank] = NULL;
+  for (int r = 0; r < proxyState->tpLocalnRanks; r++) {
+    if (proxyState->progressState.localPeers[r]) return ncclSuccess;
   }
   // All peers are freed, free array
-  free(comm->proxyState.progressState.localPeers);
-  comm->proxyState.progressState.localPeers = NULL;
+  free(proxyState->progressState.localPeers);
+  proxyState->progressState.localPeers = NULL;
   return ncclSuccess;
 }
 
-static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
-  int rank = comm->localRankToRank[connection->localRank];
-  int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
-  NCCLCHECK(sharedBuffersInit(comm, comm->hasFineGrain, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
+static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) {
+  NCCLCHECK(sharedBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL));
   return ncclSuccess;
 }
 
-static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct setupReq* req = (struct setupReq*) reqBuff;
   if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
 
@@ -499,9 +527,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   NCCLCHECK(ncclCalloc(&resources, 1));
   connection->transportResources = resources;
 
-  resources->rank = req->rank;
-  resources->localRank = req->localRank;
-  resources->remoteRank = req->remoteRank;
+  resources->tpRank = req->tpRank;
+  resources->tpLocalRank = req->tpLocalRank;
+  resources->tpRemoteRank = req->tpRemoteRank;
   resources->netDev = req->netDev;
   resources->shared = connection->shared = req->shared;
   resources->useGdr = req->useGdr;
@@ -509,9 +537,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   resources->connIndex = req->connIndex;
   resources->curr_hdp_reg = req->curr_hdp_reg;
   ncclNetProperties_t props;
-  NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
+  NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
   resources->maxRecvs = props.maxRecvs;
 
   // We don't return any data
@@ -520,7 +548,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct setupReq* req = (struct setupReq*) reqBuff;
   if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
 
@@ -528,9 +556,9 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   NCCLCHECK(ncclCalloc(&resources, 1));
   connection->transportResources = resources;
 
-  resources->rank = req->rank;
-  resources->localRank = req->localRank;
-  resources->remoteRank = req->remoteRank;
+  resources->tpRank = req->tpRank;
+  resources->tpLocalRank = req->tpLocalRank;
+  resources->tpRemoteRank = req->tpRemoteRank;
   resources->netDev = req->netDev;
   resources->shared = connection->shared = req->shared;
   resources->useGdr = req->useGdr;
@@ -538,50 +566,50 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   resources->channelId = req->channelId;
   resources->connIndex = req->connIndex;
   ncclNetProperties_t props;
-  NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
+  NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
   resources->maxRecvs = props.maxRecvs;
 
   if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
-  NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
+  NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
   *done = 1;
 
   return ncclSuccess;
 }
 
-static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct sendResources* resources = (struct sendResources*)(connection->transportResources);
   if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
   ncclResult_t ret = ncclSuccess;
 
   if (resources->shared) {
     // Shared buffers
-    struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+    struct ncclProxyProgressState* progressState = &proxyState->progressState;
     if (progressState->localPeers == NULL) {
-      NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+      NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
     }
     struct ncclProxyPeer** localPeers = progressState->localPeers;
-    if (localPeers[resources->localRank] == NULL) {
-      NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+    if (localPeers[resources->tpLocalRank] == NULL) {
+      NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1));
     }
-    connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId;
+    connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->send.proxyAppend + resources->channelId;
 
     if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
       // Connect or reuse connection for a netdev/remote rank.
       if (progressState->netComms[resources->netDev] == NULL) {
-        NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+        NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
       }
-      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
-      if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId);
+      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
+      if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, comms->sendComm + resources->channelId);
       resources->netSendComm = comms->sendComm[resources->channelId];
       if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
     } else {
-      ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
+      ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
     }
   } else {
     // Connect to remote peer
-    ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
+    ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }
 
@@ -594,28 +622,27 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 
   // Create structures
   struct connectMap* map = &resources->map;
-  map->sameProcess =
-    comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+  map->sameProcess = connection->sameProcess;
   map->shared = resources->shared;
   CUDACHECK(cudaGetDevice(&map->cudaDev));
 
   if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]);
-      resources->buffSizes[p] = comm->buffSizes[p];
+      NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, proxyState->buffSizes[p], buffs[p]);
+      resources->buffSizes[p] = proxyState->buffSizes[p];
     }
   } else {
     // Get shared buffers
     int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
     struct connectMapMem* mapMem = map->mems+bank;
     NCCLCHECK(sharedBuffersInit(
-          comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels,
-          &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc));
+          proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels,
+          &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc));
     resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
 
-    if (comm->allocP2pNetLLBuffers) {
-      NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
-      resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
+    if (proxyState->allocP2pNetLLBuffers) {
+      NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
+      resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
     }
 
     NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
@@ -626,15 +653,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 
   if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
     if (resources->shared == 0) {
-      if (!map->sameProcess) {
+      if (!map->sameProcess || ncclCuMemEnable()) {
         ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
+        NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
+                                                 (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+      } else {
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
       }
-      NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, comm->sideStream, resources->useGdr));
       map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
     }
-    if (!map->sameProcess) {
-      CUDACHECK(cudaIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
-    }
   }
   if (map->sameProcess) {
     NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
@@ -644,7 +671,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   }
   if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
     uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr));
 
     resources->gdcSync = cpuPtr;
     struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -669,24 +696,24 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
       if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
         int dmabuf_fd;
         CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
-        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
         (void)close(dmabuf_fd);
       } else // FALL-THROUGH to nv_peermem GDR path
 #else
       /* DMA-BUF support */
       int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-      if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
+      if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
         int dmabuf_fd;
         uint64_t offset;
         CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
-        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
         (void)close(dmabuf_fd);
         INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
           (const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
       } else // FALL-THROUGH to nv_peermem GDR path
 #endif
       {
-        NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
       }
     }
   }
@@ -697,40 +724,40 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   if (reqSize != sizeof(int)) return ncclInternalError;
   struct recvResources* resources = (struct recvResources*)(connection->transportResources);
-  resources->proxyRank = *(int*)reqBuff;
+  resources->tpRemoteProxyRank = *(int*)reqBuff;
   ncclResult_t ret = ncclSuccess;
 
   // Finish connection establishment from remote peer
   if (resources->shared) {
     // Shared buffers
-    struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+    struct ncclProxyProgressState* progressState = &proxyState->progressState;
     if (progressState->localPeers == NULL) {
-      NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+      NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks));
     }
     struct ncclProxyPeer** localPeers = progressState->localPeers;
-    if (localPeers[resources->localRank] == NULL) {
-      NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+    if (localPeers[resources->tpLocalRank] == NULL) {
+      NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1));
     }
-    connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId;
+    connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->recv.proxyAppend + resources->channelId;
 
     if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
       // Connect or reuse connection for a netdev/remote rank.
       if (progressState->netComms[resources->netDev] == NULL) {
-        NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+        NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
       }
-      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
-      if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId);
+      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank;
+      if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId);
       resources->netRecvComm = comms->recvComm[resources->channelId];
       if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
     } else {
-      ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
+      ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
     }
   } else {
     // Connect to remote peer
-    ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
+    ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }
 
@@ -741,26 +768,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   }
   *done = 1;
 
-  NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
+  NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm));
 
   // Create structures
   struct connectMap* map = &resources->map;
-  map->sameProcess =
-    comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+  map->sameProcess = connection->sameProcess;
   if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv
   map->shared = resources->shared;
 
   if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]);
-      resources->buffSizes[p] = comm->buffSizes[p];
+      NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, proxyState->buffSizes[p], buffs[p]);
+      resources->buffSizes[p] = proxyState->buffSizes[p];
     }
   } else {
     // Get shared buffers
     int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
     struct connectMapMem* mapMem = map->mems+bank;
     NCCLCHECK(sharedBuffersInit(
-          comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels,
+          proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels,
           &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
     resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
     NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
@@ -769,14 +795,19 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
   NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
 
-  if (comm->allocP2pNetLLBuffers) {
-    NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, comm->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
-    resources->buffSizes[NCCL_PROTO_LL] = comm->buffSizes[NCCL_PROTO_LL];
+  if (proxyState->allocP2pNetLLBuffers) {
+    NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
+    resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
   }
 
   if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
     if (resources->shared == 0) {
-      NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, comm->sideStream, resources->useGdr));
+      if (ncclCuMemEnable()) {
+        NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
+                                                 (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+      } else {
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
+      }
       map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
     }
   }
@@ -784,7 +815,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
   if (ncclGdrCopy && map->sameProcess) {
     uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr));
 
     if (ncclParamGdrCopySyncEnable()) {
       resources->gdcSync = cpuPtr;
@@ -807,24 +838,24 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
       if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
         int dmabuf_fd;
         CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
-        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
         (void)close(dmabuf_fd);
       } else // FALL-THROUGH to nv_peermem GDR path
 #else
       /* DMA-BUF support */
       int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-      if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
+      if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
         int dmabuf_fd;
         uint64_t offset;
         CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
-        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
         (void)close(dmabuf_fd);
         INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
           (const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
       } else // FALL-THROUGH to nv_peermem GDR path
 #endif
       {
-        NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
       }
     }
   }
@@ -835,17 +866,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   return ncclSuccess;
 }
 
-static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
   struct sendResources* resources = (struct sendResources*)(connection->transportResources);
   if (connection->state == connSharedInitialized) { // NVB Preconnect
-    NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
+    NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection));
     return ncclSuccess;
   }
 
   if (connection->state == connConnected) {
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
       if (resources->buffers[p]) {
-        NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p]));
       }
     }
     struct connectMapMem* mems = resources->map.mems;
@@ -854,19 +885,25 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
     } else {
       NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle));
     }
-    CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    if (!resources->map.sameProcess || ncclCuMemEnable()) {
+      // cuMem API support
+      if (mems[NCCL_NET_MAP_DEVMEM].size) {
+        NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
+      }
+    }
     if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
     if (resources->shared) {
-      NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
+      NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection));
       if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
-        struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
+        struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank;
         comms->sendRefCount[resources->channelId]--;
-        if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
+        if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId]));
       } else {
-        NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
+        NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
       }
     } else {
-      NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
+      NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
     }
   }
 
@@ -874,37 +911,43 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { // Tears down the recv-side resources stored in connection->transportResources.
   struct recvResources* resources = (struct recvResources*)(connection->transportResources);
   if (connection->state == connSharedInitialized) { // NVB Preconnect
-    NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
+    NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection));
     return ncclSuccess;
   }
 
   if (connection->state == connConnected) {
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
       if (resources->buffers[p]) {
-        NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
+        NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p])); // drop the registration stored in mhandles[p]
       }
     }
     struct connectMapMem* mems = resources->map.mems;
     NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
-    CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    if (!resources->map.sameProcess || ncclCuMemEnable()) { // recvProxyConnect returns ncclInternalError when sameProcess == 0, so ncclCuMemEnable() is the deciding term here
+      // cuMem API support
+      if (mems[NCCL_NET_MAP_DEVMEM].size) {
+        NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); // counterpart of ncclP2pAllocateShareableBuffer() in recvProxyConnect
+      }
+    }
     if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
     if (resources->shared) {
-      NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
+      NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection));
       if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
-        struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
+        struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank; // indexed by net device, then by remote proxy rank
         comms->recvRefCount[resources->channelId]--;
-        if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
+        if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId])); // last reference on this channel closes the shared recv comm
       } else {
-        NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
+        NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
       }
     } else {
-      NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
+      NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
     }
   }
-  
+
   if (resources) free(resources);
   return ncclSuccess;
 }
@@ -915,12 +958,10 @@ static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to c
 static int g_npkit_net_poll_cnt = 0;
 #endif
 
-static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
-
+static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
   g_npkit_net_poll_cnt++;
 #endif
-
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
@@ -952,7 +993,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         if (resources->shared) {
           int sharedBuffSlot = sub->posted%maxDepth;
           int offset;
-          NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
+          NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
           resources->recvMem->offsFifo[buffSlot] = offset;
           __sync_synchronize();
           volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@@ -1010,7 +1051,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
               *resources->curr_hdp_reg = 1;
             }
             // Data is ready, try to send.
-            NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
+            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot));
             if (sub->requests[buffSlot] != NULL) {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
@@ -1044,7 +1085,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
       if (sub->done < sub->transmitted) {
         int done;
         int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
-        NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL));
+        NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL));
         if (done) {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
@@ -1086,12 +1127,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
   return ncclSuccess;
 }
 
-static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
-
+static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
   g_npkit_net_poll_cnt++;
 #endif
-
   if (args->state == ncclProxyOpReady) {
     // Initialize subs and group them by same recvComm.
     void* recvComm;
@@ -1151,7 +1190,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
           if (p == NCCL_PROTO_SIMPLE && resources->shared) {
             int sharedBuffSlot = sub->posted%maxDepth;
             int offset;
-            NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
+            NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
             volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
             offsFifo[buffSlot] = offset;
             ptrs[subCount] = localBuff+offset;
@@ -1160,7 +1199,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
           }
           sizes[subCount] = stepSize*args->sliceSteps;
           if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
-          tags[subCount] = resources->remoteRank;
+          tags[subCount] = resources->tpRemoteRank;
           mhandles[subCount] = resources->mhandles[p];
           subCount++;
         }
@@ -1169,7 +1208,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         uint64_t step = subGroup->posted;
         struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
         void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
-        NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
+        NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
         if (*requestPtr) {
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup+i;
@@ -1207,7 +1246,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         int sizes[NCCL_PROXY_MAX_SUBS];
         void* mhandles[NCCL_PROXY_MAX_SUBS];
         for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
-        NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
+        NCCLCHECK(proxyState->ncclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes));
         if (done) {
           int needFlush = 0;
           int totalSize = 0;
@@ -1264,7 +1303,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
                 }
               }
               struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
-              NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
+              NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
             }
           }
           args->idle = 0;
@@ -1279,7 +1318,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
         uint64_t step = subGroup->transmitted;
         int done = 1;
         void* request = subGroup->requests[step%NCCL_STEPS];
-        if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL));
+        if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL));
         if (done) {
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup + i;
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index a01f391133..195372e81e 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -106,6 +106,7 @@ static void* ncclIbAsyncThreadMain(void* args) {
 }
 
 NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
+NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1);
 
 static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) {
   char devicePath[PATH_MAX];
@@ -117,7 +118,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
     // Merge multi-port NICs into the same PCI device
     p[strlen(p)-1] = '0';
     // Also merge virtual functions (VF) into the same device
-    p[strlen(p)-3] = '0';
+    if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0';
     // And keep the real port aside (the ibv port is always 1 on recent cards)
     *realPort = 0;
     for (int d=0; d<ncclNIbDevs; d++) {
@@ -403,16 +404,25 @@ struct ncclIbHandle {
   struct ncclIbCommStage stage; // Used by the other side when connecting
 };
 
+// Retain local and remote RoCE addresses for error logging
+struct ncclIbGidInfo {
+  uint8_t link_layer;       // copied from portAttr.link_layer while the comm is being set up
+  union ibv_gid localGid;   // filled by wrap_ibv_query_gid() for the local device port
+  union ibv_gid remoteGid;  // subnet_prefix/interface_id copied from the peer's ncclIbQpInfo (spn/iid)
+};
+
 #define NCCL_NET_IB_REQ_UNUSED 0
 #define NCCL_NET_IB_REQ_SEND 1
 #define NCCL_NET_IB_REQ_RECV 2
 #define NCCL_NET_IB_REQ_FLUSH 3
+const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" };
 
 struct ncclIbRequest {
   struct ncclIbVerbs* verbs;
   int type;
   int events;
   struct ncclSocket* sock;
+  struct ncclIbGidInfo* gidInfo;
   int nreqs;
   union {
     struct {
@@ -462,8 +472,10 @@ struct ncclIbSendComm {
   int ready;
   struct ibv_qp* qps[NCCL_IB_MAX_QPS];
   int nqps;
+  int qpIndex;
   struct ibv_mr* fifoMr;
   int ar;
+  struct ncclIbGidInfo gidInfo;
 };
 // The SendFifo needs to be 32-byte aligned and each element needs
 // to be a 32-byte multiple, so that an entry does not get split and
@@ -496,7 +508,9 @@ struct ncclIbRecvComm {
   int ready;
   struct ibv_qp* qps[NCCL_IB_MAX_QPS];
   int nqps;
+  int qpIndex;
   struct ncclIbGpuFlush gpuFlush;
+  struct ncclIbGidInfo gidInfo;
 };
 static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
 
@@ -682,15 +696,14 @@ ib_connect_check:
 
   // RoCE support
   qpInfo.lid = portAttr.lid;
-  qpInfo.link_layer = portAttr.link_layer;
+  qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer;
   if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
     for (int q=0; q<comm->nqps; q++)
       INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
   } else { // RoCE
-    union ibv_gid gid;
-    NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
-    qpInfo.spn = gid.global.subnet_prefix;
-    qpInfo.iid = gid.global.interface_id;
+    NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid));
+    qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix;
+    qpInfo.iid = comm->gidInfo.localGid.global.interface_id;
     for (int q=0; q<comm->nqps; q++)
       INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
   }
@@ -716,6 +729,8 @@ ib_connect:
 
   memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
 
+  comm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
+  comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
   for (int q=0; q<comm->nqps; q++) {
     struct ibv_qp* qp = comm->qps[q];
     NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
@@ -777,6 +792,9 @@ ib_recv:
   /* copy back the received info */
   memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
 
+  rComm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
+  rComm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
+
   // IB setup
   struct ibv_context* ctx;
   uint8_t ib_port;
@@ -784,8 +802,7 @@ ib_recv:
   ib_port = ncclIbDevs[lComm->dev].port;
   struct ibv_port_attr portAttr;
   NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
-  union ibv_gid gid;
-  NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
+  NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &rComm->gidInfo.localGid));
 
   // QP Creation
   NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs));
@@ -812,7 +829,8 @@ ib_recv:
   if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
 
   // Allocate Flush dummy buffer for GPU Direct RDMA
-  rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
+  rComm->gpuFlush.enabled = ((ncclIbGdrSupport(lComm->dev) == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess)
+                             && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0;
   if (rComm->gpuFlush.enabled) {
     NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
     rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem;
@@ -823,8 +841,8 @@ ib_recv:
     localQpInfo.lid=portAttr.lid;
     localQpInfo.link_layer=portAttr.link_layer;
     localQpInfo.ib_port=ib_port;
-    localQpInfo.spn=gid.global.subnet_prefix;
-    localQpInfo.iid=gid.global.interface_id;
+    localQpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
+    localQpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
     localQpInfo.mtu=portAttr.active_mtu;
     NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, rComm->gpuFlush.qp->qp_num, &localQpInfo));
     NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp));
@@ -833,11 +851,11 @@ ib_recv:
   // Fill Handle
   struct ncclIbQpInfo qpInfo;
   qpInfo.lid=portAttr.lid;
-  qpInfo.link_layer=portAttr.link_layer;
+  qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer;
   qpInfo.ib_port=ib_port;
   for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num;
-  qpInfo.spn=gid.global.subnet_prefix;
-  qpInfo.iid=gid.global.interface_id;
+  qpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
+  qpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
   qpInfo.mtu=remQpInfo.mtu;
 
   stage->state = ncclIbCommStateSend;
@@ -875,6 +893,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
       r->verbs = verbs;
       r->events = 1;
       r->sock = NULL;
+      r->gidInfo = NULL;
       *req = r;
       return ncclSuccess;
     }
@@ -979,6 +998,8 @@ returning:
   return res;
 }
 
+NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1);
+
 ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
   struct ncclIbRequest** reqs = comm->fifoReqs[slot];
   volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
@@ -1034,9 +1055,10 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
 
   // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work
   const int align = 128;
-  for (int q=0; q<comm->nqps; q++) {
+  const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
+  for (int q=0; q<nqps; q++) {
     for (int r=0; r<nreqs; r++) {
-      int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
+      int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
       int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
       if (length <= 0) {
         comm->wrs[r].sg_list = NULL;
@@ -1048,10 +1070,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
       }
     }
     struct ibv_send_wr* bad_wr;
-    NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr));
+    NCCLCHECK(wrap_ibv_post_send(comm->qps[comm->qpIndex], comm->wrs, &bad_wr));
+    comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
 
     for (int r=0; r<nreqs; r++) {
-      int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, comm->nqps), align) * align;
+      int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
       reqs[r]->send.offset += chunkSize;
       comm->sges[r].addr += chunkSize;
       comm->wrs[r].wr.rdma.remote_addr += chunkSize;
@@ -1111,7 +1134,8 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
     req->send.data = data;
     req->send.lkey = mr->lkey;
     req->send.offset = 0;
-    req->events = comm->nqps;
+    req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
+    if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
     *request = reqs[r] = req;
 
     // If this is a multi-recv, send only when all requests have matched.
@@ -1205,6 +1229,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
   req->type = NCCL_NET_IB_REQ_RECV;
   req->sock = &comm->sock;
   req->nreqs = n;
+  if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
   for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
 
   struct ibv_recv_wr wr;
@@ -1215,13 +1240,15 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
   wr.num_sge = 0;
 
   TIME_START(1);
-  for (int q=0; q<comm->nqps; q++) {
-    struct ibv_qp* qp = comm->qps[q];
+  const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
+  for (int q=0; q<nqps; q++) {
+    struct ibv_qp* qp = comm->qps[comm->qpIndex];
     struct ibv_recv_wr* bad_wr;
     NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr));
+    comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
   }
   TIME_STOP(1);
-  req->events = comm->nqps;
+  req->events = nqps;
 
   *request = req;
 
@@ -1292,8 +1319,16 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
         char line[SOCKET_NAME_MAXLEN+1];
         union ncclSocketAddress addr;
         ncclSocketGetAddr(r->sock, &addr);
-        WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
-             ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+        char localGidString[INET6_ADDRSTRLEN] = "";
+        char remoteGidString[INET6_ADDRSTRLEN] = "";
+        const char* localGidStr = NULL, *remoteGidStr = NULL;
+        if (r->gidInfo) {
+            localGidStr = inet_ntop(AF_INET6, &r->gidInfo->localGid, localGidString, sizeof(localGidString));
+            remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString));
+        }
+        WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d (%s)%s%s%s%s",
+            ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type],
+            localGidStr ?  " localGid ":"", localGidString, remoteGidStr ? " remoteGid ":"", remoteGidString);
         return ncclRemoteError;
       }
 
diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc
index 336877ce2b..633cb04d8e 100644
--- a/src/transport/nvls.cc
+++ b/src/transport/nvls.cc
@@ -43,22 +43,7 @@ struct ncclTransport nvlsTransport = {
   { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
 };
 
-#define NVLS_HANDLE_SIZE 64
-
-struct nvlsResources {
-  CUmulticastObjectProp properties;
-  CUmemAccessDesc accessDesc;
-  int dev;
-  size_t size;
-  size_t granularity;
-  CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
-  char* mcBuff; // Multicast NVLS buffer address
-  CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
-  char* ucBuff; // Unicast NVLS buffer address
-};
-
-
-ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) {
+ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) {
   CUmulticastObjectProp* prop = &resources->properties;
   memset(prop, 0, sizeof(*prop));
   prop->size = size;
@@ -81,7 +66,7 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* reso
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) {
+ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) {
   size_t size = resources->size;
 
   // Create a Multicast group
@@ -103,24 +88,13 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resour
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) {
+ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
   INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev);
   CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev));
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) {
-  int dev = resources->dev;
-  size_t size = resources->size;
-  INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
-
-  // Unbind physical memory from group for the given device
-  CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
-
-  return ncclSuccess;
-}
-
-ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) {
+ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, char* shareableHandle) {
   CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
 
   INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
@@ -131,9 +105,11 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resou
     int fd = *(int *)shareableHandle;
     TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
     struct ncclProxyConnector proxyConn;
-    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn));
+    int tpProxyRank = comm->topParentRanks[rank];
+    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn));
     TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
-    NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int)));
+    NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, (int *)shareableHandle));
+    fd = *(int *)shareableHandle;
     TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
     CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
   } else {
@@ -146,7 +122,20 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resou
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) {
+ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
+  CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
+
+  // Close the file descriptor stored in the shareable handle, if one is in use
+  if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+    // cuMem UDS support
+    int fd = *(int *)resources->shareableHandle;
+    (void) close(fd);
+  }
+
+  return ncclSuccess;
+}
+
+ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
   size_t size = resources->size;
   size_t granularity;
   CUdeviceptr ptr = 0;
@@ -178,7 +167,21 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resou
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) {
+ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
+  int dev = resources->dev;
+  size_t size = resources->size;
+  INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
+
+  // Unbind physical memory from group for the given device
+  CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
+
+  // Release the MC group resources
+  NCCLCHECK(nvlsGroupDisconnect(comm, resources));
+
+  return ncclSuccess;
+}
+
+ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
   size_t size = resources->size;
   CUdeviceptr ptr = 0;
 
@@ -196,7 +199,7 @@ ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resour
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) {
+ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
   size_t size;
   CUdeviceptr ptr;
   INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)",
@@ -224,135 +227,172 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* reso
 
 #define NVLS_MEM_ALIGN_SIZE (1 << 21)
 
+NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2);
 NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
 
-NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1);
+ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
+  comm->nvlsSupport = 0;
+  comm->nvlsChannels = 0;
+
+  int gpuCount;
+  NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
+  if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
 
-ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
-  if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes>1) return ncclSuccess;
   CUdevice dev;
   int driverVersion;
+
   if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess;
-  CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
+  CUCHECK(cuCtxGetDevice(&dev));
   CUDACHECK(cudaDriverGetVersion(&driverVersion));
-  comm->nvlsSupport = 0;
-  // NVLS Multicast support requires CUDA12.1 UMD + KMD
-  if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) {
-    CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
-  }
-  INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
-  if (comm->nvlsSupport == 0) return ncclSuccess;
-
-  int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels()));
-  int rank = comm->localRank, nranks = comm->localRanks;
-
-  for (int c=0; c<nChannels; c++) {
-    NCCLCHECK(initChannel(comm, c));
-  }
-  ncclResult_t res = ncclSuccess;
-  struct nvlsResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  comm->nvlsResources = resources;
-
-  size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
-  size_t memSize = NVLS_MEM_ALIGN_SIZE;
-  size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize);
-  size_t nvlsTotalSize = nvlsPerRankSize*nranks;
-
-  INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
-       comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
-
-  char* nvlsShareableHandle = NULL;
-  NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
-  NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup);
-  if (rank == 0) {
-    NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup);
-    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
+  if (ncclParamNvlsEnable() == 2) {
+    // NVLS Multicast support requires CUDA12.1 UMD + KMD
+    if (CUPFN(cuMulticastCreate) != NULL /*&& driverVersion >= 12010 */) {
+      CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
+    }
   } else {
-    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
-    NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup);
+    comm->nvlsSupport = 1;
   }
 
-  NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
-  NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
-  // Local intra-node barrier to ensure everyone has bound their memory to the group
-  NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
-  NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
+  INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
+  if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels()));
+  return ncclSuccess;
+}
 
-  for (int c=0; c<nChannels; c++) {
-    struct ncclChannel* channel = comm->channels+c;
-    channel->nvls.nHeads = nranks;
-    for (int i=0; i<NCCL_MAX_NVLS_ARITY; i++) channel->nvls.up[i] = -1;
-    channel->nvls.down = comm->nRanks+1+comm->localRank;
-    channel->nvls.out = -1;       // Network not yet implemented.
-    channel->nvls.headRank = comm->localRank;  // Network not yet implemented.
-  }
+ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
+  if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess;
 
-  for (int r=0; r<nranks; r++) {
-    int nvlsPeer = comm->nRanks+1+r;
-    for (int c=0; c<nChannels; c++) {
-      struct ncclChannel* channel = comm->channels+c;
-      channel->nvls.up[r] = nvlsPeer;
+  int nHeads = comm->channels[0].nvls.nHeads;
+  int headRank = comm->channels[0].nvls.headRank;
 
-      char* mem = NULL;
-      struct ncclChannelPeer* peer = channel->peers+nvlsPeer;
+  CUdevice dev;
+  CUCHECK(cuCtxGetDevice(&dev));
 
-      // Reduce UC -> MC
-      mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize);
-      peer->send[0].transportComm = &nvlsTransport.send;
-      peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
-      peer->send[0].conn.head = (uint64_t*)(mem+buffSize);
-      peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
-      mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize);
-      peer->recv[1].transportComm = &nvlsTransport.recv;
-      peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
-      peer->recv[1].conn.head = (uint64_t*)(mem+buffSize);
-      peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
-      peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL;
+  ncclResult_t res = ncclSuccess;
+  bool nvlsShare = true;
+  if (parent && parent->nvlsSupport && parent->config.splitShare && parent->localRanks == comm->localRanks)
+    nvlsShare = true;
+  else
+    nvlsShare = false;
 
-      // Broadcast MC -> UC
-      mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
-      peer->recv[0].transportComm = &nvlsTransport.recv;
-      peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
-      peer->recv[0].conn.head = (uint64_t*)(mem+buffSize);
-      peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
-      mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
-      peer->send[1].transportComm = &nvlsTransport.send;
-      peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
-      peer->send[1].conn.head = (uint64_t*)(mem+buffSize);
-      peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
-      peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL;
+  if (nvlsShare) {
+    /* reuse NVLS resources */
+    comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
+    for (int c = 0; c < comm->nvlsChannels; c++) {
+      NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup);
+    }
 
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
-      CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
+    comm->nvlsResources = parent->nvlsResources;
+    ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount);
+  } else {
+    int nChannels;
+    struct ncclNvlsSharedRes* resources;
 
-      /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
-          nvlsPeer, c,
-          resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize),
-          resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize),
-          resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize),
-          resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/
+    NCCLCHECK(ncclCalloc(&resources, 1));
+    comm->nvlsResources = resources;
+    resources->refCount = 1;
+
+    if (parent && parent->config.splitShare) {
+      /* Ranks on other nodes might share the NVLS resources, so cap nvlsChannels
+       * to make sure nvlsChannels matches on every rank. */
+      comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
+    }
+
+    nChannels = resources->nChannels = comm->nvlsChannels;
+    for (int c = 0; c < nChannels; c++) {
+      NCCLCHECK(initNvlsChannel(comm, c, parent, false));
+    }
+
+    size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
+    size_t memSize = NVLS_MEM_ALIGN_SIZE;
+    size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize);
+    size_t nvlsTotalSize = nvlsPerRankSize * nHeads;
+
+    INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
+      comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
+
+    char* shareableHandle = resources->shareableHandle;
+    NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup);
+    if (comm->localRank == 0) {
+      NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, comm->localRank, comm->localRanks, shareableHandle), res, cleanup);
+      NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
+    } else {
+      NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
+      NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, comm->localRankToRank[0], shareableHandle), res, cleanup);
+    }
+
+    NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
+    NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
+    // Local intra-node barrier to ensure everyone has bound their memory to the group
+    NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
+    NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
+
+    for (int h = 0; h < nHeads; h++) {
+      int nvlsPeer = comm->nRanks + 1 + h;
+      for (int c = 0; c < nChannels; c++) {
+        struct ncclChannel* channel = comm->channels + c;
+        char* mem = NULL;
+        struct ncclChannelPeer* peer = channel->peers[nvlsPeer];
+
+        // Reduce UC -> MC
+        mem = resources->ucBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
+        peer->send[1].transportComm = &nvlsTransport.send;
+        peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
+        peer->send[1].conn.head = (uint64_t*)(mem + buffSize);
+        peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
+        mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
+        peer->recv[0].transportComm = &nvlsTransport.recv;
+        peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
+        peer->recv[0].conn.head = (uint64_t*)(mem + buffSize);
+        peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
+        peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL;
+
+        // Broadcast MC -> UC
+        mem = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
+        peer->recv[1].transportComm = &nvlsTransport.recv;
+        peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
+        peer->recv[1].conn.head = (uint64_t*)(mem + buffSize);
+        peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
+        mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
+        peer->send[0].transportComm = &nvlsTransport.send;
+        peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
+        peer->send[0].conn.head = (uint64_t*)(mem + buffSize);
+        peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
+        peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;
+
+        struct ncclDevChannelPeer* addr;
+        CUDACHECKGOTO(cudaMemcpyAsync(&addr, comm->channels[c].devPeers + nvlsPeer, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+        CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+        CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+        CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+        CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+
+        /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
+            nvlsPeer, c,
+            resources->mcBuff + (h*2*nChannels+c)*(buffSize+memSize),
+            resources->mcBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize),
+            resources->ucBuff + (h*2*nChannels+c)*(buffSize+memSize),
+            resources->ucBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize));*/
+      }
     }
   }
 
-  free(nvlsShareableHandle);
   return res;
 
 cleanup:
   comm->nvlsSupport = 0;
-  free(nvlsShareableHandle);
   return res;
 }
 
 ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
-  struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources;
+  struct ncclNvlsSharedRes* resources = (struct ncclNvlsSharedRes*)comm->nvlsResources;
   if (resources == NULL) return ncclSuccess;
-  NCCLCHECK(nvlsGroupUnbind(comm, resources));
-  NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
-  free(resources);
-  comm->nvlsResources = NULL;
+
+  if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) {
+    NCCLCHECK(nvlsGroupUnbind(comm, resources));
+    NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
+    free(resources);
+    comm->nvlsResources = NULL;
+  }
   return ncclSuccess;
 }
 
@@ -362,7 +402,12 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
  * Pre CUDA 12.1 stubs
  */
 
-ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
+ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
+  comm->nvlsChannels = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
   return ncclSuccess;
 }
 
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index d73d451e01..460b4bf4e9 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -11,17 +11,21 @@
 #include "shm.h"
 #include "graph.h"
 #include "graph/topo.h"
+#include "p2p.h"
+
+enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM };
 
 struct ncclP2pBuff {
   void* directPtr;
-  cudaIpcMemHandle_t devIpc;
+  size_t size;
+  ncclIpcDesc ipcDesc;
 };
 
 struct p2pConnectInfo {
   int rank;
   int read;
   struct ncclP2pBuff p2pBuff;
-  // Use by CE memcpy
+  // Used by CE memcpy
   char shmName[7];
   int shmSize;
 };
@@ -31,7 +35,7 @@ struct p2pShm {
   struct ncclSendMem sendMem;
   struct ncclRecvMem recvMem;
 };
-struct p2pProxyInfo {
+struct p2pShmProxyInfo {
   // Shared memory between proxy and receiving GPU
   struct p2pShm* shm;
   struct p2pShm* devShm;
@@ -46,29 +50,33 @@ struct p2pProxyInfo {
   // Receiver buffer
   char* recvFifo;
 
-  // Used by progress only
+  // Used by CE memcpy progress only
   uint64_t step;
   cudaStream_t stream;
   cudaEvent_t events[NCCL_STEPS];
 };
 static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
 
-struct p2pSendResources {
-  struct ncclSendMem* devMem;
-  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
-  void* sendMemIpc;
-  void* recvMemIpc;
-  struct p2pProxyInfo proxyInfo;
-};
-
-struct p2pRecvResources {
-  struct ncclRecvMem* devMem;
+struct p2pResources {
+  enum p2pType type;
+  union {
+    struct ncclSendMem* sendDevMem;
+    struct ncclRecvMem* recvDevMem;
+  };
   void* sendMemIpc;
   void* recvMemIpc;
+  // CE memcpy support
+  struct p2pShmProxyInfo proxyInfo;
   struct p2pShm* shm;
   struct p2pShm* devShm;
   int shmSize;
   ncclShmHandle_t handle;
+  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
+};
+
+// cuMem API support
+struct p2pCuMemProxyInfo {
+  struct ncclP2pBuff p2pBuff;
 };
 
 #include <sys/types.h>
@@ -90,6 +98,7 @@ static int busIdToCudaDev(int64_t busId) {
   return -1;
 }
 
+// CE memcpy support
 NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
 static int useMemcpy = 0;
 static void initCeOperation();
@@ -149,14 +158,11 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
     *ret = 0;
     return ncclSuccess;
   }
-  if (p2p == 0 && cudaDev1 == cudaDev2 && info1->busId == info2->busId) {
-    p2p = 1;
-  }
 
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
-  // Check that legacy IPC support is available
-  if (p2p != 0) {
+  // The legacy IPC check below always fails with NCCL_CUMEM_ENABLE=1, so skip it then
+  if (p2p != 0 && !ncclCuMemEnable()) {
     // Cached result of the legacyIPC detection
     static int legacyIPC = -1;
     if (legacyIPC >= 0) {
@@ -166,12 +172,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
     // Check that legacy IPC support is available (WSL WAR)
     char *dummy;
     cudaIpcMemHandle_t ipc;
-    NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN));
+    NCCLCHECK(ncclCudaMalloc(&dummy, CUDA_IPC_MIN));
     if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) {
       INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported");
       *ret = 0;
     }
-    CUDACHECK(cudaFree(dummy));
+    NCCLCHECK(ncclCudaFree(dummy));
     legacyIPC = *ret;
     return ncclSuccess;
   }
@@ -193,6 +199,98 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
     TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
   } while (0)
 
+// cuMem API support
+ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
+  if (ncclCuMemEnable()) {
+#if CUDART_VERSION >= 11030
+    // cuMem API support
+    CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
+    CUmemGenericAllocationHandle handle;
+
+    NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
+    CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
+#else
+    return ncclInternalError;
+#endif
+  } else {
+    // Allocate a CUDA buffer and generate an IPC handle for it
+    NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, true));
+    cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr);
+    if (res != cudaSuccess) {
+      WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
+      ncclCudaFree(*ptr);
+      CUDACHECK(res);
+    }
+  }
+  INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zi ipcDesc %p", *ptr, size, ipcDesc);
+
+  return ncclSuccess;
+}
+
+ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
+  if (ncclCuMemEnable()) {
+    // cuMem API support
+    CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
+
+    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+      int fd = *(int *) &ipcDesc->cuDesc.data;
+      if (fd <= 0) return ncclInternalError;
+      (void) close(fd);
+    }
+  }
+
+  return ncclSuccess;
+}
+
+ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
+  if (ncclCuMemEnable()) {
+#if CUDART_VERSION >= 11030
+    // cuMem API support
+    CUdeviceptr dptr = 0;
+    CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
+    CUmemGenericAllocationHandle handle;
+    ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
+
+    // Import and map the remote memory descriptor to the local GPU
+    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+      // UDS fd support
+      struct ncclProxyConnector proxyConn;
+      int fd = *(int *)(&cuDesc->data);
+      int newFd = -1;
+      NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn));
+      NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, &newFd));
+      INFO(NCCL_P2P, "UDS converted fd %d -> %d on peer %d", fd, newFd, tpPeer);
+      CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)newFd, type));
+      close(newFd);
+    } else {
+      CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type));
+    }
+    CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
+    CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0));
+
+    TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%lx dptr %p", size, (long)handle, (void*)dptr);
+
+    // Allow access by the local GPU
+    CUmemAccessDesc accessDesc = {};
+    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    accessDesc.location.id = comm->cudaDev;
+    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
+    TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
+
+    *devMemPtr = (void *)dptr;
+#else
+    return ncclInternalError;
+#endif
+  } else {
+    // Legacy CUDA IPC
+    CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess));
+  }
+
+  INFO(NCCL_P2P, "Imported shareable buffer device %d size %zi ptr %p", comm->cudaDev, size, *devMemPtr);
+
+  return ncclSuccess;
+}
 
 // Setting this to non zero causes P2P to use Reads rather than Writes
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
@@ -209,10 +307,11 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
   return ncclSuccess;
 }
 
-static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
-  if (myInfo->pidHash == peerInfo->pidHash) {
+static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
+  if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
     if (peerInfo->cudaDev != myInfo->cudaDev) {
-      // Enable P2P access
+      // Same PID different GPUs, enable P2P access
+      // Legacy CUDA IPC
       cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
         cudaGetLastError();
@@ -225,8 +324,15 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
     *devMem = p2pBuff->directPtr;
     *ipcPtr = NULL;
   } else {
-    CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pBuff->devIpc, cudaIpcMemLazyEnablePeerAccess));
-    *ipcPtr = *devMem;
+    if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) {
+      // Same PID and GPU
+      *devMem = p2pBuff->directPtr;
+      *ipcPtr = NULL;
+    } else {
+      // Different PID or different GPU
+      NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
+      *ipcPtr = *devMem;
+    }
   }
   return ncclSuccess;
 }
@@ -234,7 +340,8 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
 /* Send: Create and return connect structures for this peer to connect to me */
 ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
     struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
-  struct p2pSendResources* resources;
+  struct p2pResources* resources;
+  int tpProxyRank;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
   int useRead, intermediateRank;
@@ -261,35 +368,47 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   int sendSize = sizeof(struct ncclSendMem);
   // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
-  if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
+  if (info->read) sendSize += comm->buffSizes[NCCL_PROTO_SIMPLE];
   ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
-      if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+      resources->type = P2P_DIRECT;
+      send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
           channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
     } else {
+      // cuMem API support
+      if (ncclCuMemEnable()) {
+        resources->type = P2P_CUMEM;
+        INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s comm %p nRanks %02d",
+             channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);;
+      } else {
+        // Legacy CUDA IPC
+        resources->type = P2P_IPC;
+        INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
+             channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
+      }
       send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
-      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
-          channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
     }
   } else {
+    resources->type = P2P_INTERMEDIATE;
     info->rank = intermediateRank;
     INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
         channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
-	comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
+	  comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
   }
 
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
+  tpProxyRank = comm->topParentRanks[info->rank];
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn));
   if (useMemcpy) {
-    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
+    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo)));
     info->shmSize = resources->proxyInfo.shmSize;
     memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
   } else {
-    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
-    NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
+    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+    NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
   }
 
   return ncclSuccess;
@@ -298,7 +417,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 /* Create and return connect structures for this peer to connect to me */
 ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
     struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
-  struct p2pRecvResources* resources;
+  struct p2pResources* resources;
+  int tpProxyRank;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
   int useRead, intermediateRank;
@@ -312,44 +432,56 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   int recvSize = sizeof(struct ncclRecvMem);
   // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += comm->buffSizes[p];
   ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
-      if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+      resources->type = P2P_DIRECT;
+      recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
     } else {
+      if (ncclCuMemEnable()) {
+        // cuMem API support
+        resources->type = P2P_CUMEM;
+        TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/CUMEM",
+              channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      } else {
+        // Legacy CUDA IPC
+        resources->type = P2P_IPC;
+      }
       recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
     }
   } else {
+    resources->type = P2P_INTERMEDIATE;
     info->rank = intermediateRank;
   }
 
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
-  NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+  tpProxyRank = comm->topParentRanks[info->rank];
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
+  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
 
-  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
+  NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
   return ncclSuccess;
 }
 
 /* Connect/Send to this peer */
 static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
-  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
-  struct ncclRecvMem* remDevMem;
+  struct p2pResources* resources = (struct p2pResources*)send->transportResources;
+  struct ncclRecvMem* remDevMem = NULL;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
 
-  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
+  NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
 
   char* buff = (char*)(remDevMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     if (info->read && p == NCCL_PROTO_SIMPLE) {
       /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
-      if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy
-      send->conn.buffs[p] = (char*)(resources->devMem+1);
+      if (resources->sendDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
+      send->conn.buffs[p] = (char*)(resources->sendDevMem+1);
     } else {
       send->conn.buffs[p] = buff;
-      buff += send->comm->buffSizes[p];
+      buff += comm->buffSizes[p];
     }
   }
 
@@ -358,20 +490,20 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
     send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
     send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
     // Send SIMPLE buff to proxy, and replace it by local buffer
-    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
+    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
     send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
   } else {
     send->conn.tail = &remDevMem->tail;
-    send->conn.head = &resources->devMem->head;
-    send->conn.ptrExchange = &resources->devMem->ptrExchange;
-    send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
+    send->conn.head = &resources->sendDevMem->head;
+    send->conn.ptrExchange = &resources->sendDevMem->ptrExchange;
+    send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange;
   }
   return ncclSuccess;
 }
 
 /* Connect/Recv from this peer */
 ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
-  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
+  struct p2pResources* resources = (struct p2pResources*)recv->transportResources;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
 
   struct ncclSendMem* remDevMem = NULL;
@@ -381,20 +513,22 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
     sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
     TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
     resources->shmSize = info->shmSize;
+    // Attach to peer's SHM segment
     NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
 
     recv->conn.tail = &resources->devShm->recvMem.tail;
     recv->conn.head = &resources->devShm->sendMem.head;
   } else {
-    NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
+    NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
 
-    recv->conn.tail = &resources->devMem->tail;
+    struct ncclRecvMem* devMem = resources->recvDevMem;
+    recv->conn.tail = &devMem->tail;
     recv->conn.head = &remDevMem->head;
     recv->conn.ptrExchange = &remDevMem->ptrExchange;
     recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
   }
 
-  char* buff = (char*)(resources->devMem+1);
+  char* buff = (char*)(resources->recvDevMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     if (info->read && p == NCCL_PROTO_SIMPLE) {
       if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
@@ -402,93 +536,113 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
       recv->conn.buffs[p] = (char*)(remDevMem+1);
     } else {
       recv->conn.buffs[p] = buff;
-      buff += recv->comm->buffSizes[p];
+      buff += comm->buffSizes[p];
     }
   }
   return ncclSuccess;
 }
 
 ncclResult_t p2pSendFree(struct ncclConnector* send) {
-  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
+  struct p2pResources* resources = (struct p2pResources*)send->transportResources; // Connector state stored by p2pSendSetup; nothing to release when NULL
   if (resources) {
-    if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
-    if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
+    if (ncclCuMemEnable()) {
+      // cuMem API support: sendMemIpc/recvMemIpc are non-NULL only for buffers p2pMap imported; release them with ncclCudaFree
+      if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
+      if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
+    }
+    else { // Legacy CUDA IPC: close the imported mappings with cudaIpcCloseMemHandle instead
+      if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
+      if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
+    }
     free(resources);
   }
   return ncclSuccess;
 }
 
 ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
-  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
+  struct p2pResources* resources = (struct p2pResources*)recv->transportResources; // Connector state stored by p2pRecvSetup; nothing to release when NULL
   if (resources) {
-    if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
-    if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
-    if (useMemcpy) {
-      NCCLCHECK(ncclShmClose(resources->handle));
+    if (ncclCuMemEnable()) {
+      // cuMem API support: sendMemIpc/recvMemIpc are non-NULL only for buffers p2pMap imported; release them with ncclCudaFree
+      if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
+      if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
+    } else {
+      // Legacy CUDA IPC: close the imported mappings with cudaIpcCloseMemHandle instead
+      if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
+      if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
+    }
+    if (useMemcpy) { // The SHM segment attached in p2pRecvConnect must be closed whether or not cuMem is enabled
+      NCCLCHECK(ncclShmClose(resources->handle));
     }
     free(resources);
   }
   return ncclSuccess;
 }
 
-static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { // Proxy-side setup of a P2P send connection; fills respBuff and sets *done on success
   if (useMemcpy) {
-    struct p2pProxyInfo* proxyInfo;
+    // CE memcpy support: the response is a p2pShmProxyInfo describing a SHM segment plus a device staging buffer
+    struct p2pShmProxyInfo* proxyInfo;
     NCCLCHECK(ncclCalloc(&proxyInfo, 1));
     connection->transportResources = proxyInfo;
 
-    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream, true));
+    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, true)); // Device buffer sized for the SIMPLE protocol
 
     char shmPath[PATH_MAX];
     shmPath[0] = '\0';
     proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
+    // Create a SHM segment for the peer to attach to; only the name suffix after "/dev/shm/nccl-" is kept in shmName
     NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
     TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
     memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
 
     NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
 
-    if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError;
-    memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo));
+    if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError; // NOTE(review): checked after the allocations above; presumably p2pSendProxyFree reclaims them — confirm
+    memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); // Hand the whole proxy info back to the caller by value
   } else {
     if (reqSize != sizeof(int)) return ncclInternalError;
     int size = *((int*)reqBuff);
     if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
     struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
-    NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, comm->sideStream, true));
-    connection->transportResources = p2pBuff->directPtr;
-    cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
-    if (res != cudaSuccess) {
-      WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
-      cudaFree(p2pBuff->directPtr);
-      free(p2pBuff);
-      CUDACHECK(res);
+    NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); // Writes the descriptor and local pointer straight into the response
+    p2pBuff->size = size; // Passed to ncclP2pImportShareableBuffer by p2pMap on the importing side
+    if (ncclCuMemEnable()) {
+      // cuMem API support: the free path needs both ipcDesc and directPtr, so remember the whole p2pBuff
+      struct p2pCuMemProxyInfo* proxyInfo;
+      NCCLCHECK(ncclCalloc(&proxyInfo, 1));
+      memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff)); // Private copy: p2pBuff itself points into respBuff
+      connection->transportResources = proxyInfo;
+    } else {
+      connection->transportResources = p2pBuff->directPtr; // Legacy path: only the device pointer is needed to free later
     }
   }
   *done = 1;
   return ncclSuccess;
 }
 
-static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { // Proxy-side setup of a P2P recv connection: reqBuff holds the int buffer size, respBuff receives the ncclP2pBuff
   if (reqSize != sizeof(int)) return ncclInternalError;
   int size = *((int*)reqBuff);
   if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
   struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
-  NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, comm->sideStream, true));
-  connection->transportResources = p2pBuff->directPtr;
-  cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
-  if (res != cudaSuccess) {
-    WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
-    cudaFree(p2pBuff->directPtr);
-    free(p2pBuff);
-    CUDACHECK(res);
+  NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); // Writes the descriptor and local pointer straight into the response
+  p2pBuff->size = size; // Passed to ncclP2pImportShareableBuffer by p2pMap on the importing side
+  if (ncclCuMemEnable()) {
+    // cuMem API support: the free path needs both ipcDesc and directPtr, so remember the whole p2pBuff
+    struct p2pCuMemProxyInfo* proxyInfo;
+    NCCLCHECK(ncclCalloc(&proxyInfo, 1));
+    memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff)); // Private copy: p2pBuff itself points into respBuff
+    connection->transportResources = proxyInfo;
+  } else {
+    connection->transportResources = p2pBuff->directPtr; // Legacy path: only the device pointer is needed to free later
   }
   *done = 1;
   return ncclSuccess;
 }
 
-static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
-  struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
+static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
 
   if (reqSize != sizeof(void*)) return ncclInternalError;
   proxyInfo->recvFifo = *((char**)reqBuff);
@@ -501,13 +655,14 @@ static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection,
   return ncclSuccess;
 }
 
-static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
+  // CE memcpy support
   if (useMemcpy) {
-    struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
+    struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
     if (proxyInfo) {
       NCCLCHECK(ncclShmClose(proxyInfo->handle));
       NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
-      CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
+      NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
       CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
       for (int i=0; i<NCCL_STEPS; i++) {
         CUDACHECK(cudaEventDestroy(proxyInfo->events[i]));
@@ -515,23 +670,45 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
       free(proxyInfo);
     }
   } else {
-    // Do not check return code as CUDA may have already shut down
-    cudaFree(connection->transportResources);
+    if (ncclCuMemEnable()) {
+      // cuMem API support
+      struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources;
+      if (proxyInfo) {
+        struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff;
+        ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc);
+        ncclCudaFree(p2pBuff->directPtr);
+        free(proxyInfo);
+      }
+    } else {
+      // Do not check return code as CUDA may have already shut down
+      ncclCudaFree(connection->transportResources);
+    }
   }
   return ncclSuccess;
 }
 
-static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
-  // Do not check return code as CUDA may have already shut down
-  cudaFree(connection->transportResources);
+static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { // Releases what p2pRecvProxySetup stored in connection->transportResources; always returns ncclSuccess
+  if (ncclCuMemEnable()) {
+    struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources; // cuMem path stored a p2pCuMemProxyInfo, not a raw device pointer
+    if (proxyInfo) {
+      struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff;
+      ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc); // Return codes of the two release calls are intentionally not checked here
+      ncclCudaFree(p2pBuff->directPtr);
+      free(proxyInfo);
+    }
+  } else {
+    // Do not check return code as CUDA may have already shut down
+    ncclCudaFree(connection->transportResources); // Legacy path: transportResources is the device buffer itself
+  }
   return ncclSuccess;
 }
 
-static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+// CE memcpy support
+static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
+      struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
       // Round to next multiple of sliceSteps
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->transmitted = sub->done = 0;
@@ -541,10 +718,10 @@ static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxy
   args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
     int p = args->protocol;
-    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
+    int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
+      struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
       if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
           resources->step = sub->base + sub->nsteps;
           args->done++;
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index e125df2c2f..dbabf66725 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -85,7 +85,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
   shmPath[0] = '\0';
   int shmSize = sizeof(struct ncclSendMem);
   if (shmLocality == SHM_SEND_SIDE) {
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
   }
   info->shmSize = resources->shmSize = shmSize;
   NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
@@ -108,7 +108,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
   shmPath[0] = '\0';
   int shmSize = sizeof(struct ncclRecvMem);
   if (shmLocality == SHM_RECV_SIDE) {
-    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
   }
   info->shmSize = resources->shmSize = shmSize;
   NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
@@ -146,7 +146,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
   char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     send->conn.buffs[p] = buff;
-    buff += send->comm->buffSizes[p];
+    buff += comm->buffSizes[p];
   }
   send->conn.tail = &resources->devRemHostMem->tail;
   send->conn.head = &resources->devHostMem->head;
@@ -155,9 +155,11 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
     send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
   }
   if (useMemcpySend) {
-    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
+    int tpProxyRank;
+    tpProxyRank = comm->topParentRanks[comm->rank];
+    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
     struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
-    NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
+    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
     send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
     send->conn.tail = &proxyInfo.ceRecvMem->tail;
     send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
@@ -179,7 +181,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
   char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     recv->conn.buffs[p] = buff;
-    buff += recv->comm->buffSizes[p];
+    buff += comm->buffSizes[p];
   }
   recv->conn.head = &resources->devRemHostMem->head;
   recv->conn.tail = &resources->devHostMem->tail;
@@ -187,7 +189,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
   if (useMemcpyRecv) {
-    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
+    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->topParentRanks[comm->rank], &recv->proxyConn)); // Address the proxy by top-parent rank, the same translation shmSendConnect applies
     struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
-    NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
+    NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
     recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
     recv->conn.tail = &proxyInfo.ceRecvMem->tail;
   }
@@ -214,12 +216,12 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
   return ncclSuccess;
 }
 
-static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct shmProxyInfo* proxyInfo;
   NCCLCHECK(ncclCalloc(&proxyInfo, 1));
   if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
   memcpy(proxyInfo, reqBuff, reqSize);
-  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream));
+  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr));
   NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
   CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
   for (int i=0; i<NCCL_STEPS; i++) {
@@ -232,12 +234,12 @@ static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection,
   return ncclSuccess;
 }
 
-static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   struct shmProxyInfo* proxyInfo;
   NCCLCHECK(ncclCalloc(&proxyInfo, 1));
   if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
   memcpy(proxyInfo, reqBuff, reqSize);
-  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE], comm->sideStream));
+  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr));
   NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
   CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
   for (int i=0; i<NCCL_STEPS; i++) {
@@ -250,12 +252,12 @@ static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection,
   return ncclSuccess;
 }
 
-static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
   struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
 
   if (resources) {
     CUDACHECK(cudaStreamDestroy(resources->stream));
-    CUDACHECK(cudaFree(resources->devFifo));
+    NCCLCHECK(ncclCudaFree(resources->devFifo));
     NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
     for (int i=0; i<NCCL_STEPS; i++) {
       CUDACHECK(cudaEventDestroy(resources->events[i]));
@@ -265,12 +267,12 @@ static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, str
   return ncclSuccess;
 }
 
-static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
   struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
 
   if (resources) {
     CUDACHECK(cudaStreamDestroy(resources->stream));
-    CUDACHECK(cudaFree(resources->devFifo));
+    NCCLCHECK(ncclCudaFree(resources->devFifo));
     NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
     for (int i=0; i<NCCL_STEPS; i++) {
       CUDACHECK(cudaEventDestroy(resources->events[i]));
@@ -280,7 +282,7 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
   return ncclSuccess;
 }
 
-static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
@@ -294,7 +296,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy
   args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
     int p = args->protocol;
-    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
+    int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
       struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
@@ -339,7 +341,7 @@ static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxy
   return ncclSuccess;
 }
 
-static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
@@ -353,7 +355,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxy
   args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
     int p = args->protocol;
-    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
+    int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
       struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp
index a45f8bdedb..54356d0f0f 100644
--- a/test/common/TestBedChild.cpp
+++ b/test/common/TestBedChild.cpp
@@ -196,7 +196,7 @@ namespace RcclUnitTesting
 
       if (useMultiRankPerGpu)
       {
-        if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess)
+        //if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess)
         {
           ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId);
           status = TEST_FAIL;
diff --git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile
index 4a51a427ce..7446d7b52a 100644
--- a/tools/topo_expl/Makefile
+++ b/tools/topo_expl/Makefile
@@ -6,7 +6,7 @@ endif
 HIPCC = $(HIP_PATH)/bin/hipcc
 
 EXE = topo_expl
-CXXFLAGS = -g -O3 -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
+CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
 
 files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
 	hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
diff --git a/tools/topo_expl/include/model.h b/tools/topo_expl/include/model.h
index 13e66fbaf0..3b02c55ea2 100644
--- a/tools/topo_expl/include/model.h
+++ b/tools/topo_expl/include/model.h
@@ -69,7 +69,7 @@ public:
 
   int rankToCudaDev(int rank) {
     for (int i=0; i<getNumGpus(); i++) {
-      if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank[0])
+      if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank)
         return systems[0]->nodes[GPU].nodes[i].gpu.dev;
     }
     return -1;
@@ -77,7 +77,7 @@ public:
 
   int64_t getGpuBusId(int rank) {
     for (int i=0; i<getNumGpus(); i++) {
-      if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank[0])
+      if (rank == systems[0]->nodes[GPU].nodes[i].gpu.rank)
         return systems[0]->nodes[GPU].nodes[i].id;
     }
     return -1;
@@ -93,7 +93,7 @@ public:
   void setRanks() {
     for (int r=0; r<getNumGpus(); r++)
       for (int i=0; i<getNumGpus(); i++)
-        systems[r]->nodes[GPU].nodes[i].gpu.rank[0] += firstRank;
+        systems[r]->nodes[GPU].nodes[i].gpu.rank += firstRank;
   }
 
   int p2pCanConnect(int device1, int device2) { return 1; }
@@ -133,4 +133,4 @@ public:
   NetworkModel() : nRanks(0) {}
 };
 
-#endif
+#endif
\ No newline at end of file
diff --git a/tools/topo_expl/include/nccl.h b/tools/topo_expl/include/nccl.h
index aab3a4487f..27737e2231 100644
--- a/tools/topo_expl/include/nccl.h
+++ b/tools/topo_expl/include/nccl.h
@@ -1,6 +1,7 @@
 /*************************************************************************
  * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,24 +13,25 @@
 #include <hip/hip_fp16.h>
 
 #define NCCL_MAJOR 2
-#define NCCL_MINOR 14
-#define NCCL_PATCH 3
+#define NCCL_MINOR 18
+#define NCCL_PATCH 1
 #define NCCL_SUFFIX ""
 
-#define NCCL_VERSION_CODE 21403
+#define NCCL_VERSION_CODE 21801
 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
 
 #define RCCL_BFLOAT16 1
 #define RCCL_GATHER_SCATTER 1
 #define RCCL_ALLTOALLV 1
-#define RCCL_MULTIRANKPERGPU 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*! @brief Opaque handle to communicator */
+#include <limits.h>
 typedef struct ncclComm* ncclComm_t;
+#define NCCL_COMM_NULL NULL
 
 #define NCCL_UNIQUE_ID_BYTES 128
 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
@@ -45,15 +47,24 @@ typedef enum { ncclSuccess                 =  0,
                ncclInProgress              =  7,
                ncclNumResults              =  8 } ncclResult_t;
 
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+#define NCCL_SPLIT_NOCOLOR -1
+
 /* Communicator configuration. Users can assign value to attributes to specify the
  * behavior of a communicator. */
-typedef struct ncclConfig_v21400 {
+typedef struct ncclConfig_v21700 {
   /* attributes that users should never touch. */
   size_t size;
   unsigned int magic;
   unsigned int version;
   /* attributes that users are able to customize. */
   int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
+  int splitShare;
 } ncclConfig_t;
 
 /* Config initializer must be assigned to initialize config structure when it is created.
@@ -62,7 +73,12 @@ typedef struct ncclConfig_v21400 {
   sizeof(ncclConfig_t), /* size */                                      \
   0xcafebeef,           /* magic */                                     \
   NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
-  1                     /* blocking */                                  \
+  NCCL_CONFIG_UNDEF_INT,                    /* blocking */              \
+  NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
+  NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
+  NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
+  NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
+  NCCL_CONFIG_UNDEF_INT                     /* splitShare */            \
 }
 
 /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
@@ -117,28 +133,6 @@ ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
 ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
 /// @endcond
 
-/*! @brief Creates a new communicator (multi thread/process version) allowing multiple ranks per device.
-
-    @details
-    rank must be between 0 and nranks-1 and unique within a communicator clique.
-    Each rank is associated to a HIP device, which has to be set before calling
-    ncclCommInitRankMulti.
-    Since this version of the function allows multiple ranks to utilize the same
-    HIP device, a unique virtualId per device has to be provided by each calling
-    rank.
-    ncclCommInitRankMulti implicitly syncronizes with other ranks, so it must be
-    called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
-
-    @param[in]
-    comm        ncclComm_t*
-                communicator struct pointer
-    */
-  ncclResult_t  ncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
-/// @cond include_hidden
-  ncclResult_t pncclCommInitRankMulti(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, int virtualId);
-/// @endcond
-
-
 /*! @brief Creates a clique of communicators (single process version).
  *
  * @details This is a convenience function to create a single-process communicator clique.
@@ -177,6 +171,19 @@ ncclResult_t  ncclCommAbort(ncclComm_t comm);
 ncclResult_t pncclCommAbort(ncclComm_t comm);
 /// @endcond
 
+/*! @brief Creates one or more communicators from an existing one.
+ * Ranks with the same color will end up in the same communicator.
+ * Within the new communicator, key will be used to order ranks.
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
+ * and will therefore return a NULL communicator.
+ * If config is NULL, the new communicator will inherit the original communicator's
+ * configuration*/
+ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+/// @cond include_hidden
+ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+/// @endcond
+
+/* Returns a string for each error code. */
 /*! @brief Returns a string for each error code. */
 const char*  ncclGetErrorString(ncclResult_t result);
 /// @cond include_hidden
@@ -188,7 +195,7 @@ const char* pncclGetErrorString(ncclResult_t result);
  */
 const char*  ncclGetLastError(ncclComm_t comm);
 /// @cond include_hidden
-const char* pncclGetError(ncclComm_t comm);
+const char* pncclGetLastError(ncclComm_t comm);
 /// @endcond
 
 /* Checks whether the comm has encountered any asynchronous errors */
@@ -498,6 +505,44 @@ ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
     const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 /// @endcond
 
+/*! @brief Opaque handle to MSCCL algorithm */
+typedef int mscclAlgoHandle_t;
+
+/*! @brief MSCCL Load Algorithm
+ *
+ * @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
+ * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
+ * scheduler instead of end users.
+ */
+ncclResult_t  mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
+ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
+
+/*! @brief MSCCL Run Algorithm
+ *
+ * @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
+ * list merges all possible parameters required by different operations as this
+ * is a general-purpose API. This API is expected to be called by MSCCL
+ * scheduler instead of end users.
+ */
+ncclResult_t  mscclRunAlgo(
+    const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
+    void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
+    size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
+    mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
+ncclResult_t pmscclRunAlgo(
+    const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
+    void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
+    size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
+    mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
+
+/*! @brief MSCCL Unload Algorithm
+ *
+ * @details Unload MSCCL algorithm previously loaded using its handle. This API
+ * is expected to be called by MSCCL scheduler instead of end users.
+ */
+ncclResult_t  mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
+ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
+
 /*
  * Group semantics
  *
diff --git a/tools/topo_expl/include/utils.h b/tools/topo_expl/include/utils.h
index 8ce2172727..2554380d80 100644
--- a/tools/topo_expl/include/utils.h
+++ b/tools/topo_expl/include/utils.h
@@ -8,8 +8,7 @@
 #ifndef UTILS_H_
 #define UTILS_H_
 
-// AllGather3 - begin
-struct ncclGraphInfo {
+struct graphInfo {
   int pattern;
   int nChannels;
   int sameChannels;
@@ -19,14 +18,10 @@ struct ncclGraphInfo {
   int typeInter;
 };
 
-struct allGather3Data_t{
-  int netDev;
-  int collNetSupport;
-  int nc;
-  struct ncclGraphInfo tree;
-  struct ncclGraphInfo ring;
-  struct ncclGraphInfo collNet;
+struct allGatherInfo {
+  struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
   struct ncclTopoRanks topoRanks;
+  int nc;
   bool pivotA2AEnabled;
   bool ll128Enabled;
   bool mscclEnabled;
@@ -40,11 +35,11 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
 
 ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash);
 
-ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
-  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph, struct ncclComm* parent = NULL);
 
-ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
-  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);
+ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph);
 
 #define TIME_START(index)
 
diff --git a/tools/topo_expl/model.cpp b/tools/topo_expl/model.cpp
index b677de49b3..f1d063ce63 100644
--- a/tools/topo_expl/model.cpp
+++ b/tools/topo_expl/model.cpp
@@ -216,10 +216,10 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
 
   if (proxyRank == myInfo->rank) {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
         req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   } else {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
         proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   }
   *((int*)connectInfo) = proxyRank;
@@ -242,7 +242,7 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
 
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, comm->ncclNet->name, req.netDev,
       req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   return ncclSuccess;
 }
diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp
index bbb5eaff56..ea636a5492 100644
--- a/tools/topo_expl/topo_expl.cpp
+++ b/tools/topo_expl/topo_expl.cpp
@@ -153,10 +153,15 @@ NodeModelDesc model_descs[] = {
   {2, "topo_8p1h_5.xml",        "2 nodes 8P1H Alt."},
 };
 
+NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);
+NCCL_PARAM(MinCTAs, "MIN_CTAS", 1);
+
 int main(int argc,char* argv[])
 {
   struct ncclComm *comm;
   const int num_models = sizeof(model_descs) / sizeof(*model_descs);
+  int minCTAsEnv;
+  int maxCTAsEnv;
 
   if (!cmdOptionExists(argv, argv + argc, "-m")) {
     printf("Usage: ./topo_expl -m model_id\n");
@@ -200,18 +205,22 @@ int main(int argc,char* argv[])
       node_model->rankToCudaDev(i), node_model->getGpuBusId(i));
   }
 
+  minCTAsEnv = ncclParamMinCTAs();
+  maxCTAsEnv = ncclParamMaxCTAs();
+
   NCCLCHECK(ncclCalloc(&comm, nranks));
 
   struct ncclPeerInfo *peerInfo;
   NCCLCHECK(ncclCalloc(&peerInfo, nranks+1)); // Extra rank to represent CollNet root
 
-  struct allGather3Data_t *allGather3Data;
+  struct allGatherInfo* allGather3Data;
   NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
 
-  struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph;
+  struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph, *nvlsGraph;
   NCCLCHECK(ncclCalloc(&treeGraph, nranks));
   NCCLCHECK(ncclCalloc(&ringGraph, nranks));
   NCCLCHECK(ncclCalloc(&collNetGraph, nranks));
+  NCCLCHECK(ncclCalloc(&nvlsGraph, nranks));
 
   for (int i = 0; i < nranks; i++) {
     comm[i].rank = i;
@@ -224,8 +233,23 @@ int main(int argc,char* argv[])
     comm[i].topo = node_model->getSystem(i);
     comm[i].peerInfo = peerInfo;
     comm[i].ncclNet = ncclNet;
-    comm[i].virtualId = -1;
-    // Mark channels as non initialized.
+    comm[i].config.maxCTAs = maxCTAsEnv;
+    comm[i].config.minCTAs = minCTAsEnv;
+    if (comm[i].topParentRanks == NULL) {
+      NCCLCHECK(ncclCalloc(&comm[i].topParentRanks, comm->nRanks));
+      for (int j = 0; j < comm->nRanks; ++j)
+        comm[i].topParentRanks[j] = j;
+    }
+    struct ncclSharedResources* sharedRes = NULL;
+    NCCLCHECK(ncclCalloc(&sharedRes, 1));
+    /* most of attributes are assigned later in initTransportsRank(). */
+    sharedRes->owner = &comm[i];
+    sharedRes->tpNRanks = comm[i].nRanks;
+    NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm[i].nRanks));
+    comm[i].sharedRes = sharedRes;
+    sharedRes->refCount = 1;
+    ncclMemoryStackConstruct(&comm[i].memPermanent);
+    // Mark channels as not initialized.
     for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
     NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
   }
@@ -233,13 +257,13 @@ int main(int argc,char* argv[])
   for (int i = 0; i < nranks; i++) {
     node_model = network.GetNode(i);
     assert(node_model!=0);
-    initTransportsRank_1(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]);
+    initTransportsRank_1(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
   }
 
   for (int i = 0; i < nranks; i++) {
     node_model = network.GetNode(i);
     assert(node_model!=0);
-    initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]);
+    initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
   }
 
   for (uint64_t len = 8; len <= 4294967296L; len *= 2) {
diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp
index 46e4f3a568..687b515895 100644
--- a/tools/topo_expl/utils.cpp
+++ b/tools/topo_expl/utils.cpp
@@ -25,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <cstdarg>
 #include "xml.h"
 #include "coll_net.h"
 #include "model.h"
@@ -37,8 +38,15 @@ const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 
 extern NodeModel *node_model;
 
-NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
+RCCL_PARAM(CliqueIgnoreTopo, "CLIQUE_IGNORE_TOPO", 0);
+RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
+RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 1);
+RCCL_PARAM(LL128ForceEnable, "LL128_FORCE_ENABLE", 0);
+
 NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
+NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
+NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0);
+NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
 
 thread_local int ncclDebugNoWarn = 0;
 ncclCollNet_t* ncclCollNet = NULL;
@@ -74,6 +82,103 @@ ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
   return ncclSuccess;
 }
 
+void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {
+  // `me->topFrame.hunk` points to the top of the stack of non-empty hunks. Hunks above
+  // this (reachable via `->above`) are empty.
+  struct Hunk* top = me->topFrame.hunk;
+  size_t mallocSize = 0;
+
+  // If we have lots of space left in hunk but that wasn't enough then we'll
+  // allocate the object unhunked.
+  if (me->topFrame.end - me->topFrame.bumper >= 8<<10)
+    goto unhunked;
+
+  // If we have another hunk (which must be empty) waiting above this one and
+  // the object fits then use that.
+  if (top && top->above) {
+    struct Hunk* top1 = top->above;
+    uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align);
+    if (uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
+      me->topFrame.hunk = top1;
+      me->topFrame.bumper = uobj + size;
+      me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
+      return reinterpret_cast<void*>(uobj);
+    }
+  }
+
+  { // If the next hunk we're going to allocate wouldn't be big enough but the
+    // Unhunk proxy fits in the current hunk then go allocate as unhunked.
+    size_t nextSize = (top ? top->size : 0) + (64<<10);
+    constexpr size_t maxAlign = 64;
+    if (nextSize < sizeof(struct Hunk) + maxAlign + size) {
+      uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
+      if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
+        goto unhunked;
+    }
+
+    // At this point we must need another hunk, either to fit the object
+    // itself or its Unhunk proxy.
+    mallocSize = nextSize;
+    INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
+    struct Hunk *top1 = (struct Hunk*)malloc(mallocSize);
+    if (top1 == nullptr) goto malloc_exhausted;
+    top1->size = nextSize;
+    top1->above = nullptr;
+    if (top) top->above = top1;
+    top = top1;
+    me->topFrame.hunk = top;
+    me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
+    me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
+  }
+
+  { // Try to fit object in the new top hunk.
+    uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align);
+    if (uobj + size <= me->topFrame.end) {
+      me->topFrame.bumper = uobj + size;
+      return reinterpret_cast<void*>(uobj);
+    }
+  }
+
+unhunked:
+  { // We need to allocate the object out-of-band and put an Unhunk proxy in-band
+    // to keep track of it.
+    uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
+    Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
+    me->topFrame.bumper = uproxy + sizeof(Unhunk);
+    proxy->next = me->topFrame.unhunks;
+    me->topFrame.unhunks = proxy;
+    mallocSize = size;
+    proxy->obj = malloc(mallocSize);
+    INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
+    if (proxy->obj == nullptr) goto malloc_exhausted;
+    return proxy->obj;
+  }
+
+malloc_exhausted:
+  WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
+  abort();
+}
+
+void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
+  // Free unhunks first because both the frames and unhunk proxies lie within the hunks.
+  struct ncclMemoryStack::Frame* f = &me->topFrame;
+  while (f != nullptr) {
+    struct ncclMemoryStack::Unhunk* u = f->unhunks;
+    while (u != nullptr) {
+      free(u->obj);
+      u = u->next;
+    }
+    f = f->below;
+  }
+  // Free hunks
+  struct ncclMemoryStack::Hunk* h = me->stub.above;
+  while (h != nullptr) {
+    struct ncclMemoryStack::Hunk *h1 = h->above;
+    free(h);
+    h = h1;
+  }
+}
+
 int ncclDebugLevel = -1;
 
 void ncclDebugInit() {
@@ -126,43 +231,60 @@ ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem**
   return ncclSuccess;
 }
 
+NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
 
 void initCollNet() {
   if (ncclParamCollNetEnable() == 1 && ncclCollNet == 0)
     ncclCollNet = (ncclCollNet_t*)0x12345678;
 }
 
-ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
-  struct ncclChannel* channel = comm->channels+channelid;
+ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
+  struct ncclChannel* channel = &comm->channels[channelId];
   if (channel->id != -1) return ncclSuccess;
-  channel->id = channelid;
 
-  // Ring index to user rank table.
-  //NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+  int nRanks = comm->nRanks;
+  int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */;
+  channel->id = channelId;
+  channel->workFifoSent = 0;
 
-  // Communication structures with peers.
-  //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
-  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
-  for (size_t i=0; i<comm->nRanks+1; ++i) {
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      channel->peers[i].send[b].comm = comm;
-      channel->peers[i].recv[b].comm = comm;
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
+
+  //NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
+
+  if (channel->peers == NULL) {
+    // The extra one in nRanks+1 is for the collnet root (i.e. network)
+    // Allocate everything related to sharedRes with ncclCalloc as this can be
+    // shared between communicators hence should not be tied to comm.
+    if (sharedRes->peers[channelId] == NULL) {
+      NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks));
+    }
+    channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers);
+    for (int r = 0; r < nRanks; r++) {
+      channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r];
+      ncclAtomicRefCountIncrement(&channel->peers[r]->refCount);
     }
   }
+#if 0
+  if (channel->devPeers == NULL) {
+    if (sharedRes->devPeers[channelId] == NULL) {
+      NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream));
+    }
+    /* channel->devPeers is not shared, so just free it when calling commFree() */
+    NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
+    ncclCommPushCudaFree(comm, channel->devPeers);
+    for (int r = 0; r < nRanks; r++) {
+      uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+    }
+  }
+#endif
+  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
+  //NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
+  //ncclCommPushCudaFree(comm, channel->devRingUserRanks);
 
-  // Per-channel operation list.
-  //NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
-  //if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
-    // GDRCOPY support
-    // We allocate a workFifo in GDR mapped CUDA memory
-    // But we still allocate the Host workFifo so that we
-    // can copy the work elements to CUDA memory on kernel launch
-    //NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc));
-  //} else {
-    // The device workFifo is the Host one
-    //channel->workFifoDev = channel->workFifo;
-  //}
+  //NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
+  //CUDACHECK(hipEventRecord(sharedRes->deviceStream.scratchEvent, sharedRes->deviceStream.cudaStream));
+  //CUDACHECK(hipStreamWaitEvent(sharedRes->deviceStream.cudaStream, sharedRes->deviceStream.scratchEvent, 0));
 
   return ncclSuccess;
 }
@@ -213,8 +335,8 @@ template <int type>
 static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) {
   struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
   struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
-  struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
-                                                  comm->channels[channelId].peers[peer].recv + connIndex;
+  struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer]->send + connIndex :
+                                                  comm->channels[channelId].peers[peer]->recv + connIndex;
   // handle intra-node network connections
   int n1 = -1, n2 = -1;
   if (connIndex == NCCL_CONN_IDX_P2P_NET) {
@@ -248,12 +370,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int n
   uint64_t mask = 1UL << channel->id;
   for (int i=0; i<nrecv; i++) {
     int peer = peerRecv[i];
-    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue;
     comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
   }
   for (int i=0; i<nsend; i++) {
     int peer = peerSend[i];
-    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
+    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue;
     comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
   }
   return ncclSuccess;
@@ -272,9 +394,12 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
   // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
   ncclResult_t ret = ncclSuccess;
   int highestType = TRANSPORT_P2P;  // track highest transport type
-  struct ncclConnect data[2*MAXCHANNELS];
+  struct ncclConnect** data = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Store intermediate send/recvData structs for connect
+  struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
+  struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel
 
-  //NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
+  //NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+  // First time initialization
   for (int i=1; i<comm->nRanks; i++) {
     int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
     int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
@@ -282,22 +407,28 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
     uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
 
-    struct ncclConnect* recvData = data;
+    // Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer
+    // This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers
+    // The first N entries contain recvData, connection information for recv connections
+    // The next M entries contain sendData, connection information for send connections
+    // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
+    data[i] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS);
+    recvData[i] = data[i];
     int sendChannels = 0, recvChannels = 0;
     int type;
     TIME_START(0);
     for (int c=0; c<MAXCHANNELS; c++) {
       if (recvMask & (1UL<<c)) {
-        NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
+        NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[i]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
         if (type > highestType) highestType = type;
       }
     }
     TIME_STOP(0);
     TIME_START(1);
-    struct ncclConnect* sendData = recvData+recvChannels;
+    sendData[i] = recvData[i]+recvChannels;
     for (int c=0; c<MAXCHANNELS; c++) {
       if (sendMask & (1UL<<c)) {
-        NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
+        NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[i]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
         if (type > highestType) highestType = type;
       }
     }
@@ -306,48 +437,93 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     TIME_START(2);
     if (sendPeer == recvPeer) {
       if (recvChannels+sendChannels) {
-         //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
-         //NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
-         sendData = data;
-         recvData = data+sendChannels;
+        //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
+        //NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
+        sendData[i] = data[i];
+        recvData[i] = data[i]+sendChannels;
       }
     } else {
-      //if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
-      //if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
-      //if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
-      //if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
+      //if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
+      //if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      //if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      //if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
     }
     TIME_STOP(2);
-
-    TIME_START(3);
-    for (int c=0; c<MAXCHANNELS; c++) {
-      if (sendMask & (1UL<<c)) {
-        struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
-        //NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
-        conn->connected = 1;
-        //CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
-        //CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
-      }
-    }
-    TIME_STOP(3);
-    TIME_START(4);
-    for (int c=0; c<MAXCHANNELS; c++) {
-      if (recvMask & (1UL<<c)) {
-        struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
-        //NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
-        conn->connected = 1;
-        //CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
-      }
-    }
-    TIME_STOP(4);
-    comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
   }
 
+  // Loop until all channels with all ranks have been connected
+  bool allChannelsConnected;
+  allChannelsConnected = false;
+  while (!allChannelsConnected) {
+    allChannelsConnected = true;
+    for (int i=1; i<comm->nRanks; i++) {
+      int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+      int sendPeer = (comm->rank + i) % comm->nRanks;
+      uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
+      uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
+
+      int sendDataOffset = 0;
+      int recvDataOffset = 0;
+      for (int c=0; c<MAXCHANNELS; c++) {
+          TIME_START(3);
+          if (sendMask & (1UL<<c)) {
+            struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
+            // This connector hasn't completed connection yet
+            if (conn->connected == 0) {
+              //NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
+              if (ret == ncclSuccess) {
+                struct ncclDevChannelPeer* addr;
+                conn->connected = 1;
+                /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
+                //CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
+                //CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
+              } else if (ret == ncclInProgress) {
+                allChannelsConnected = false;
+              }
+            }
+          }
+          TIME_STOP(3);
+
+          // Then handle the recv connector for this channel
+          TIME_START(4);
+          if (recvMask & (1UL<<c)) {
+            struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
+            // This connector hasn't completed connection yet
+            if (conn->connected == 0) {
+              //NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
+              if (ret == ncclSuccess) {
+                struct ncclDevChannelPeer* addr;
+                conn->connected = 1;
+                /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
+                //CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
+                //CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
+              } else if (ret == ncclInProgress) {
+                allChannelsConnected = false;
+              }
+            }
+          }
+          TIME_STOP(4);
+      }
+    }
+  }
+
+  // Clear all connect masks and free each connectInfo array
+  for (int i=1; i<comm->nRanks; i++) {
+    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+    int sendPeer = (comm->rank + i) % comm->nRanks;
+    comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
+    free(data[i]);
+  }
+
+  free(data);
+  free(sendData);
+  free(recvData);
+
   if (highestTransportType != NULL) *highestTransportType = highestType;
   TIME_PRINT("P2P Setup/Connect");
 exit:
-  //NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
-  //NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
+  //NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
+  //NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
   return ret;
 fail:
   goto exit;
@@ -381,7 +557,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
   }
 
   // select
-  struct ncclChannelPeer* root = channel->peers+nranks;
+  struct ncclChannelPeer* root = channel->peers[nranks];
   // connector index: 0 for recv, 1 for send
   struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
   struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
@@ -419,10 +595,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
   }
   // connect
   if (isMaster) {
-    //NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
-    struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
-    struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
-    //CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup);
+    NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
+    struct ncclDevChannelPeer* devRoot = NULL; // only filled by the device-to-host copy below, which is disabled in this build
+    //CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
+    struct ncclConnInfo* devConnInfo = (devRoot == NULL) ? NULL : ((type == collNetRecv) ? devRoot->recv + type : devRoot->send + type); // never derive an address from an unset pointer
+    //CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
   }
   // recv side sends connect info to send side
   if (isMaster && type == collNetRecv) {
@@ -460,157 +637,60 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
   // Free collNet resources
   for (int r=0; r<comm->nChannels; r++) {
     struct ncclChannel* channel = comm->channels+r;
-    struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      struct ncclConnector* send = peer->send + b;
-      //if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
-      send->transportResources = NULL; // avoid double free
-    }
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      struct ncclConnector* recv = peer->recv + b;
-      //if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
-      recv->transportResources = NULL; // avoid double free
+    struct ncclChannelPeer* peer = channel->peers[comm->nRanks];
+    if (peer) {
+      if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
+        for (int b=0; b<NCCL_MAX_CONNS; b++) {
+          struct ncclConnector* send = peer->send + b;
+          if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
+          send->transportResources = NULL; // avoid double free
+        }
+        for (int b=0; b<NCCL_MAX_CONNS; b++) {
+          struct ncclConnector* recv = peer->recv + b;
+          if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
+          recv->transportResources = NULL; // avoid double free
+        }
+      }
     }
   }
   return ncclSuccess;
 }
 
-RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
-NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
-RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0);
-NCCL_PARAM(AllocP2pNetLLBuffers, "NCCL_ALLOC_P2P_NET_LL_BUFFERS", 0);
-RCCL_PARAM(LL128ForceEnable, "LL128_FORCE_ENABLE", 0);
-
-static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collNetGraph) {
-  ncclResult_t ret = ncclSuccess;
-  int* heads = NULL;
-  int rank = comm->rank;
-  int collNetSetupFail = 0;
-  int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
-  // Find all head ranks
-  int nHeads = collNetGraph->nChannels;
-  int highestTransportType0, highestTransportType1;
-  char line[1024];
-
-  NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
-  // Head GPU index is always 0
-  for (int c = 0; c < nHeads; c++) {
-    heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
-  }
-
-  for (int c = 0; c < comm->nChannels; c++) {
-    struct ncclChannel* channel = comm->channels + c;
-    for (int h = 0; h < nHeads; h++) {
-      const int head = heads[h];
-      collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
-      if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
-    }
-    // Verify CollNet setup across ranks after trying the first channel
-    if (c == 0) {
-      NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
-    }
-  }
-  // Verify CollNet setup across ranks after trying all channels
-  NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
-  TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
-
-  line[0] = '\0';
-  for (int c = 0; c < comm->nChannels; c++) {
-    struct ncclTree* chain = &comm->channels[c].collnetChain;
-    snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d",
-      c, chain->down[0], rank, chain->up);
-  }
-  line[1023] = '\0';
-
-  INFO(NCCL_INIT, "Collnet Chains %s", line);
-  // Connect Collnet + chain
-  for (int c = 0; c < comm->nChannels; c++) {
-    struct ncclChannel* channel = comm->channels + c;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail);
-  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0), ret, fail);
-  for (int c = 0; c < comm->nChannels; c++) {
-    struct ncclChannel* channel = comm->channels + c;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail);
-  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1), ret, fail);
-  INFO(NCCL_INIT, "Connected collnet + chain");
-
-  // Connect intra-node CollNet + Direct
-  for (int c = 0; c < comm->nChannels; c++) {
-    struct ncclChannel* channelRecv = comm->channels + c;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail);
-  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0, &highestTransportType0), ret, fail);
-
-  for (int c = 0; c < comm->nChannels; c++) {
-    struct ncclChannel* channelSend = comm->channels + c;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail);
-  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1, &highestTransportType1), ret, fail);
-
-#if 0
-  // Exchange highest intra-node transport type among ranks
-  // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
-  comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
-  NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
-  for (int i = 0; i < comm->localRanks; i++) {
-    if (highestTypes[i] > comm->intraHighestTransportType)
-      comm->intraHighestTransportType = highestTypes[i];
-  }
-#endif
-  INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
-
-exit:
-  free(heads);
-  return ret;
-fail:
-  ncclTransportCollNetFree(comm);
-  comm->collNetSupport = 0;
-  goto exit;
-}
-
-ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
-  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph, struct ncclComm* parent) {
   // We use 2 AllGathers
   // 1. { peerInfo, comm, compCap}
   // 2. { nChannels, graphInfo, topoRanks }
   ncclResult_t ret = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
   cpu_set_t affinitySave;
-  //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  //NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)commId, comm), ret, fail);
+  //struct ncclTopoGraph ringGraph;
+  //struct ncclTopoGraph treeGraph;
+  //struct ncclTopoGraph collNetGraph;
+  //struct ncclTopoGraph nvlsGraph;
+  struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph };
+
+  int nChannelsOrig;
+  struct ncclTopoRanks** allTopoRanks = NULL;
+  int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
+  int *rings = NULL;
+  int* nvbPeers = NULL;
+  struct ncclProxyConnector proxyConn;
+  int* pxnPeers = NULL;
+  int *topParentLocalRanks = NULL;
+  int tpProxyRank;
 
   // AllGather1 - begin
   //NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
   //NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, commHash), ret, fail);
   //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
 
-  //If virtualId == -1 multiRank support has not been requested by user, using original interface
-  if (comm->virtualId == -1) {
-    for (int i = 0; i < nranks; i++) {
-      if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
-        WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
-        ret = ncclInvalidUsage;
-        goto fail;
-      }
-    }
-  }
-  else {
-    //Multiple ranks can use the same device, but need to have different virtualId's.
-    for (int i = 0; i < nranks; i++) {
-      for (int j=0; j < nranks; j++) {
-      	if (j==i) continue;
-      	if((comm->peerInfo[i].hostHash  == comm->peerInfo[j].hostHash)  &&
-      	   (comm->peerInfo[i].busId     == comm->peerInfo[j].busId)     &&
-      	   (comm->peerInfo[i].virtualId == comm->peerInfo[j].virtualId)) {
-      	  WARN("Duplicate virtualId detected : rank %d and rank %d both on GPU device %lx virtualId %d",
-      	       i, j, comm->peerInfo[rank].busId, comm->peerInfo[i].virtualId);
-      	  return ncclInvalidUsage;
-      	}
-      }
+  for (int i = 0; i < nranks; i++) {
+    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+      ret = ncclInvalidUsage;
+      goto fail;
     }
   }
   // AllGather1 - end
@@ -618,6 +698,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
   do {
     // Compute intra-process ranks
     int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+    for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap); // fold in every rank's capability, not only our own
+    for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap); // fold in every rank's capability, not only our own
     for (int i = 0; i < nranks; i++) {
       if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
           && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
@@ -682,8 +764,19 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
   //  sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
   //}
 
-  // Launch proxy service thread
-  //NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+  // Determine local CollNet support
+  if (collNetSupport(comm)) {
+    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
+    if (collNetEnable != NULL) {
+      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
+      if (strcmp(collNetEnable, "1") == 0) {
+        comm->collNetSupport = 1;
+      }
+    }
+  }
+
+  // Determine local Nvls support
+  //NCCLCHECK(ncclNvlsInit(comm));
 
   // Get rings and trees
   ringGraph.id = 0;
@@ -706,8 +799,24 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
   collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
   collNetGraph.collNet = 1;
   collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
-  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
-  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
+  if (comm->collNetSupport) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
+  } else {
+    collNetGraph.nChannels = 0;
+  }
+
+  nvlsGraph.id = 3;
+  nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
+  nvlsGraph.collNet = 0;
+  nvlsGraph.minChannels = 1;
+  nvlsGraph.maxChannels = MAXCHANNELS;
+  if (comm->nvlsSupport) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
+  } else {
+    nvlsGraph.nChannels = 0;
+  }
 
   bool allXgmi, hasPeerAccess;
   allXgmi = true;
@@ -735,22 +844,10 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
   comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
 
   if (comm->rank == ncclParamGraphDumpFileRank()) {
-    struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
-    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 3, graphs), ret, fail);
+    struct ncclTopoGraph* dumpGraphs[4] = { &ringGraph, &treeGraph, &collNetGraph, &nvlsGraph };
+    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 4, dumpGraphs), ret, fail);
   }
 
-  // Determine local CollNet support before all-gather
-  if (collNetSupport(comm)) {
-    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
-    if (collNetEnable != NULL) {
-      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
-      if (strcmp(collNetEnable, "1") == 0) {
-        comm->collNetSupport = 1;
-      }
-    }
-  }
-  if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0;
-
   if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
     if (rcclParamP2pNetDisable() == 0) {
       if (!(comm->topo->type & RCCL_TOPO_FORCE_INTRA)) comm->p2pNet = 1;
@@ -764,64 +861,51 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
   int idx;
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
   allGather3Data[rank].nc = 2;
-  if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1)  ||
-	(comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
        comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
     allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
     allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
-  if ( ((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1)  ||
-	(comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
        (comm->topo->type & RCCL_TOPO_CR8G))
     allGather3Data[rank].nc = 4;
-  if (((comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->virtualId == -1)  ||
-       (comm->topo->nodes[GPU].count <= comm->topo->nRanks && comm->virtualId != -1)) &&
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
       comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
     allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
     allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
   if (ringGraph.nChannels > MAXCHANNELS/2)
     allGather3Data[rank].nc = 1;
-  NCCLCHECKGOTO(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev), ret, fail);
-  allGather3Data[rank].tree.pattern = treeGraph.pattern;
-  allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
-  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
-  allGather3Data[rank].tree.bwIntra = treeGraph.bwIntra;
-  allGather3Data[rank].tree.bwInter = treeGraph.bwInter;
-  allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
-  allGather3Data[rank].tree.typeInter = treeGraph.typeInter;
-  allGather3Data[rank].ring.pattern = ringGraph.pattern;
-  allGather3Data[rank].ring.nChannels = ringGraph.nChannels;
-  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
-  allGather3Data[rank].ring.bwIntra = ringGraph.bwIntra;
-  allGather3Data[rank].ring.bwInter = ringGraph.bwInter;
-  allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
-  allGather3Data[rank].ring.typeInter = ringGraph.typeInter;
-  allGather3Data[rank].collNet.pattern = collNetGraph.pattern;
-  allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels;
-  allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
-  allGather3Data[rank].collNet.bwIntra = collNetGraph.bwIntra;
-  allGather3Data[rank].collNet.bwInter = collNetGraph.bwInter;
-  allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
-  allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
-  allGather3Data[rank].collNetSupport = comm->collNetSupport;
   allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
   comm->topo->ll128Enabled =  comm->topo->ll128Enabled || rcclParamLL128ForceEnable();
   allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled;
   allGather3Data[rank].mscclEnabled = comm->topo->mscclEnabled;
 
-  comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
-    ? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
-  NCCLCHECKGOTO(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks), ret, fail);
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
+    allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
+    allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
+    allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
+    allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
+    allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
+    allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
+  }
+
+  comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
+  NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
 fail:
   return ret;
 }
 
-ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
-  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
+ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph, struct ncclTopoGraph& nvlsGraph) {
+  ncclResult_t ret = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  ncclResult_t ret;
+  cpu_set_t affinitySave;
+
+  struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph };
+
   int nChannelsOrig;
   struct ncclTopoRanks** allTopoRanks = NULL;
   int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
@@ -829,6 +913,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   int* nvbPeers = NULL;
   struct ncclProxyConnector proxyConn;
   int* pxnPeers = NULL;
+  int *topParentLocalRanks = NULL;
+  int tpProxyRank;
 
   //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
 
@@ -844,7 +930,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
       comm->nNodes++;
       nodesFirstRank[node] = firstRank;
       // Record tree pattern of each node as they can be different depending on sm arch
-      nodesTreePatterns[node] = allGather3Data[r].tree.pattern;
+      nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
     }
     comm->rankToNode[r] = node;
   }
@@ -887,32 +973,22 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   int nc;
   nc = allGather3Data[0].nc;
   for (int i=0; i<nranks; i++) {
-    comm->peerInfo[i].netDev = allGather3Data[i].netDev;
     allTopoRanks[i] = &allGather3Data[i].topoRanks;
     nc = std::min(allGather3Data[i].nc, nc);
     // Make sure we align all ranks so that the tuning is consistent across ranks
-    treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
-    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
-    treeGraph.bwIntra = std::min(allGather3Data[i].tree.bwIntra, treeGraph.bwIntra);
-    treeGraph.bwInter = std::min(allGather3Data[i].tree.bwInter, treeGraph.bwInter);
-    treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
-    treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
-    ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
-    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
-    ringGraph.bwIntra = std::min(allGather3Data[i].ring.bwIntra, ringGraph.bwIntra);
-    ringGraph.bwInter = std::min(allGather3Data[i].ring.bwInter, ringGraph.bwInter);
-    ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
-    ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
-    collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
-    collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
-    collNetGraph.bwIntra = std::min(allGather3Data[i].collNet.bwIntra, collNetGraph.bwIntra);
-    collNetGraph.bwInter = std::min(allGather3Data[i].collNet.bwInter, collNetGraph.bwInter);
-    collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
-    collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
-    comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
     comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled;
     comm->topo->ll128Enabled = comm->topo->ll128Enabled && allGather3Data[i].ll128Enabled;
     comm->topo->mscclEnabled = comm->topo->mscclEnabled && allGather3Data[i].mscclEnabled;
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
+      graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
+      graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
+      graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
+      graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
+      graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
+    }
+    if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
+    if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = 0;
   }
 
   comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
@@ -941,8 +1017,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   }
 
   NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc), ret, fail);
 
+  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, nc), ret, fail);
   if (comm->topo->pivotA2ANumBiRings == 3) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
 
   // AllGather3 - end
@@ -963,6 +1039,29 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
 
   //NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
 
+  // Compute nChannels per peer for p2p
+  NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+
+  /* By this point, all info of comm should be known. We can initialize shared resources and
+   * map localRanks to top parent local ranks. NOTE: this sharedRes init must come before
+   * all proxy operations. */
+  if (comm->sharedRes->owner == comm) {
+    comm->sharedRes->tpNLocalRanks = comm->localRanks;
+    comm->sharedRes->magic = comm->magic;
+    comm->sharedRes->tpNChannels = comm->nChannels;
+    comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
+    memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
+  }
+  NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
+  for (int i = 0; i < comm->localRanks; ++i) {
+    int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
+    topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank];
+  }
+  comm->topParentLocalRanks = topParentLocalRanks;
+
+  // Launch proxy service thread, after this, the proxy calls can be used.
+  //NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+
   // Connect with prev/next for each ring
   for (int c=0; c<comm->nChannels; c++) {
     struct ncclChannel* channel = comm->channels+c;
@@ -993,39 +1092,46 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail);
   INFO(NCCL_INIT, "Connected all trees");
 
+#if 0
+  // Setup NVLS
+  NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+  // And NVLS trees if needed
+  if (comm->nvlsSupport && comm->localRanks > 1) {
+    for (int c=0; c<comm->nvlsChannels; c++) {
+      struct ncclChannel* channel = comm->channels+c;
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail);
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail);
+    }
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail);
+    INFO(NCCL_INIT, "Connected NVLS tree");
+  }
+#endif
+#if CUDART_VERSION >= 12010
   // Check if we can setup CollNet
-  if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
+  if (comm->collNetSupport > 0) collNetTrySetup(comm, parent, &collNetGraph);
+#endif
 
   TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
 
   // Compute time models for algorithm and protocol combinations
-  do {
-    int myCompCap = comm->peerInfo[rank].cudaCompCap;
-    int minCompCap = myCompCap, maxCompCap = myCompCap;
-    for (int i = 0; i < nranks; i++) {
-      minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
-      maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
-    }
-    NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
-  } while(0);
-
-  // Compute nChannels per peer for p2p
-  NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+  NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
 
   INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
 
 #if 0
   do { // Setup p2p structures in comm->tasks
     struct ncclTasks* tasks = &comm->tasks;
-    int nRanks = comm->nRanks;
     int node = comm->node;
     int nNodes = comm->nNodes;
     struct ncclNodeRanks *nodeRanks = comm->nodeRanks;
     int localRank = comm->localRank;
-    tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, nRanks);
-    tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
-    tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
-    int s=0, r=0;
+    // We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8.
+    int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2);
+    tasks->p2pOrderSteps = comm->nNodes * steps;
+    tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, tasks->p2pOrderSteps);
+    tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, tasks->p2pOrderSteps);
+    tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, tasks->p2pOrderSteps);
+    int i=0;
     // schedule delta 0, +1, -1, +2, -2, ...
     // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
     for (int d=0; d <= nNodes/4; d++) {
@@ -1035,18 +1141,14 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
     sched_delta:
       int recvNode = (node+nNodes-delta)%nNodes;
       int sendNode = (node+delta)%nNodes;
-      int steps = comm->maxLocalRanks;
       for (int step=0; step < steps; step++) {
         int recvIndex = (localRank-step+steps)%steps;
-        if (recvIndex < nodeRanks[recvNode].localRanks) {
-          tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex];
-          r++;
-        }
+	int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
+        tasks->p2pRecvOrder[i] = recvRank;
         int sendIndex = (localRank+step)%steps;
-        if (sendIndex < nodeRanks[sendNode].localRanks) {
-          tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex];
-          s++;
-        }
+        int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
+        tasks->p2pSendOrder[i] = sendRank;
+        i++;
       }
       index++;
       if (index == 1 && deltas[1] == deltas[0]) index++;
@@ -1058,7 +1160,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
         goto sched_delta;
       }
     }
-    assert(s == nRanks && r == nRanks);
+    assert(i == tasks->p2pOrderSteps);
   } while (0);
 
   if (ncclParamNvbPreconnect()) {
@@ -1070,35 +1172,37 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
       int channelId;
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail);
-        if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
+        if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
           comm->connectSend[peer] |= (1UL<<channelId);
         }
       }
       for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
         NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail);
-        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
+        if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
           comm->connectRecv[peer] |= (1UL<<channelId);
         }
       }
     }
+
     NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
   }
 #endif
   // Connect to local net proxy
-  //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
-  //NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+  tpProxyRank = comm->topParentRanks[comm->rank];
+  //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
+  //NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
 
   // Then to remote ones when using PXN
   if (ncclPxnDisable(comm) == 0) {
     int nranks;
     NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
     for (int r=0; r<nranks; r++) {
-      //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
-      //NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+      tpProxyRank = comm->topParentRanks[pxnPeers[r]];
+      //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
+      //NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
     }
   }
 
-
 #if 0
   if (comm->intraRank == 0) { // Load ncclParamLaunchMode
     char* str = getenv("NCCL_LAUNCH_MODE");
@@ -1129,8 +1233,10 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
 
 exit:
   //if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  // Unlink proxy shm to make sure it will be properly cleaned up.
-  //ncclProxyShmUnlink(comm);
+  /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
+   * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+   * properly cleaned up. */
+  //if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm);
   free(allTopoRanks);
   free(nodesTreePatterns);
   free(nodesFirstRank);