Merge remote-tracking branch 'nccl/master' into develop

2022-09-09 01:20:52 +00:00
commit a79d9e3586
@@ -175,12 +175,6 @@ set(CC_SOURCES
    src/collectives/all_to_all_api.cc
    src/collectives/all_to_allv_api.cc
    src/channel.cc
-    #src/clique/CliqueManager.cc     # RCCL
-    #src/clique/HandleCache.cc       # RCCL
-    #src/clique/HandleShm.cc         # RCCL
-    #src/clique/Hash.cc              # RCCL
-    #src/clique/MsgQueue.cc          # RCCL
-    #src/clique/ShmObject.cc         # RCCL
    src/misc/argcheck.cc
    src/misc/nvmlwrap_stub.cc
    src/misc/utils.cc
@@ -193,6 +187,8 @@ set(CC_SOURCES
    src/misc/signals.cc              # RCCL
    src/misc/socket.cc
    src/misc/param.cc
+    src/misc/rocmwrap.cc
+    src/misc/strongstream.cc
    src/transport/coll_net.cc
    src/transport/net.cc
    src/transport/net_ib.cc
@@ -208,7 +204,6 @@ set(CC_SOURCES
    src/enqueue.cc
    ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)

-
 foreach(filename ${CC_SOURCES})
  list(APPEND CPP_SOURCES ${filename})
 endforeach(filename)
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 12
-NCCL_PATCH   := 12
+NCCL_MINOR   := 13
+NCCL_PATCH   := 4
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -10,7 +10,8 @@ include ../makefiles/version.mk
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
 LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
-		misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \
+		misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
+		misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
 		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
                collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -106,6 +106,7 @@ static void *bootstrapRoot(void* args) {
  do {
    struct ncclSocket sock;
    sock.abortFlag = NULL;
+    /* bootstrap root thread always uses blocking ncclSocketAccept. */
    NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
    NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
    close(sock.fd);
@@ -140,6 +141,7 @@ static void *bootstrapRoot(void* args) {
    int next = (r+1) % nranks;
    struct ncclSocket sock;
    sock.abortFlag = NULL;
+    sock.asyncFlag = 0;
    memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
    NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
    NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
@@ -289,7 +291,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));

-  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+  TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);

  return ncclSuccess;
 }
@@ -324,6 +326,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
  struct bootstrapState* state = (struct bootstrapState*)commState;
  struct ncclSocket sock;
  sock.abortFlag = state->abortFlag;
+  sock.asyncFlag = 0;
  memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
  NCCLCHECK(ncclSocketConnect(&sock));
  NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,75 +8,54 @@
 #include "param.h"
 #include "gdrwrap.h"

-// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory
-NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
-
-ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
-  struct ncclChannel* channel = comm->channels+channelid;
+ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
+  struct ncclChannel* channel = &comm->channels[channelId];
  if (channel->id != -1) return ncclSuccess;
-  channel->id = channelid;

-  // Ring index to user rank table.
-  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+  int nRanks = comm->nRanks;
+  channel->id = channelId;
+  channel->workFifoSent = 0;

-  // Communication structures with peers.
-  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
-  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
-  for (size_t i=0; i<comm->nRanks+1; ++i) {
-    for (int b=0; b<NCCL_MAX_CONNS; b++) {
-      channel->peers[i].send[b].comm = comm;
-      channel->peers[i].recv[b].comm = comm;
+  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
+
+  // The extra one (the +1 in nRanks+1) is for the collnet root (i.e. network)
+  channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
+  NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.stream));
+  ncclCommPushCudaFree(comm, channel->devPeers);
+
+  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
+  NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.stream));
+  ncclCommPushCudaFree(comm, channel->devRingUserRanks);
+
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream));
+
+  for (int r=0; r < nRanks+1; ++r) {
+    for (int b=0; b < NCCL_MAX_CONNS; b++) {
+      channel->peers[r].send[b].comm = comm;
+      channel->peers[r].recv[b].comm = comm;
    }
  }

-  // Per-channel operation list.
-  NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
-  if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
-    // GDRCOPY support
-    // We allocate a workFifo in GDR mapped CUDA memory
-    // But we still allocate the Host workFifo so that we
-    // can copy the work elements to CUDA memory on kernel launch
-    NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc));
-  } else {
-    // The device workFifo is the Host one
-    channel->workFifoDev = channel->workFifo;
-  }
-
  return ncclSuccess;
 }

 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
  if (channel->id == -1) return ncclSuccess;
-  // Operation list
-  NCCLCHECK(ncclCudaHostFree(channel->workFifo));
-  if (channel->gdrMemDesc) {
-    // GDRCOPY support
-    NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc));
-  }
-
-  // Free Ring index to rank tables
-  free(channel->ring.userRanks);
-  CUDACHECK(hipFree(channel->ring.devUserRanks));

  // Free transport proxy resources
  // Note: free all send resources first due to CollNet arrangement
  for (int r=0; r<nRanks+1; r++) {
-    struct ncclPeer* peer = channel->peers+r;
+    struct ncclChannelPeer* peer = channel->peers+r;
    for (int b=0; b<NCCL_MAX_CONNS; b++) {
      if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
    }
  }
  for (int r=0; r<nRanks+1; r++) {
-    struct ncclPeer* peer = channel->peers+r;
+    struct ncclChannelPeer* peer = channel->peers+r;
    for (int b=0; b<NCCL_MAX_CONNS; b++) {
      if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
    }
  }

-  // Free the peer structures.
-  CUDACHECK(hipFree(channel->devPeers));
-  free(channel->peers);
-
  return ncclSuccess;
 }
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
 BUILDDIR ?= $(abspath ../../../build)
 OBJDIR := $(BUILDDIR)/obj/collectives/device

-LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu alltoall_pivot.cu
+LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu

 LIBSRCFILES += functions.cu

@@ -13,11 +13,11 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclRing *ring = &ncclShmem->channel.ring;
-    const int *ringRanks = ring->devUserRanks;
+    const int *ringRanks = ring->userRanks;
    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
@@ -8,7 +8,6 @@
 #include "devcomm.h"
 #include "collectives.h"
 #include "primitives.h"
-//#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support

 #if defined(ENABLE_NPKIT)
 #include "npkit/npkit.h"
@@ -18,7 +17,7 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclRing *ring = &ncclShmem->channel.ring;
@@ -187,11 +186,6 @@ namespace {
      }
 #endif

-      // Make final copy from buffer to dest.
-      chunk = modRanks(ringIx + 1);
-      offset = calcOffset(chunk);
-      nelem = min(realChunkSize, size-offset);
-
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
      if (tid == 0) {
        NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
@@ -200,6 +194,10 @@ namespace {
      }
 #endif

+      // Make final copy from buffer to dest.
+      chunk = modRanks(ringIx + 1);
+      offset = calcOffset(chunk);
+      nelem = min(realChunkSize, size-offset);
      prims.directRecv(offset, nelem);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
@@ -223,7 +221,7 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclTree *tree = &ncclShmem->channel.tree;
@@ -375,7 +373,7 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclTree *tree = &ncclShmem->channel.tree;
@@ -600,9 +598,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
    const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
    const int hasDn = (tree->down[0] >= 0) ? 1 : 0;
    const int nThreadsScatter = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
-    const int nThreadsGather  = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
+    const int nThreadsGather  =             ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
    const int nThreadsBcast   = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS);
-    const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
+    const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
    const int tidStartBcast = nThreadsGather;
    const int tidStartScatter = tidStartBcast + nThreadsBcast;
    const int tidStartReduce = tidStartScatter + nThreadsScatter;
@@ -8,4 +8,4 @@
 #include "common.h"
 #include "collectives.h"

-IMPL_COLL_ALLTOALL_PIVOT(AllToAllPivot);
+IMPL_COLL_F(AllToAllPivot);
@@ -12,7 +12,7 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nranks = ncclShmem->comm.nRanks;
    const ncclRing *ring = &ncclShmem->channel.ring;
@@ -29,11 +29,11 @@ namespace {
    const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLTOALL_PIVOT_CHUNKSTEPS : 1));

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0, args->connIndex << 16);
+      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0);

    for (int num_hops = 0; num_hops <= nranks / 2; num_hops++) {
-      const int src_rank = ring->devUserRanks[(nranks - num_hops) % nranks];
-      const int dst_rank = ring->devUserRanks[num_hops];
+      const int src_rank = ring->userRanks[(nranks - num_hops) % nranks];
+      const int dst_rank = ring->userRanks[num_hops];
      const ssize_t send_offset =
          dst_rank * num_elems * elem_size + chunk_offset +
          (src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
@@ -12,7 +12,7 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclRing *ring = &ncclShmem->channel.ring;
@@ -20,8 +20,8 @@ namespace {
    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
    const ssize_t loopSize = nChannels*chunkSize;
    const ssize_t size = args->count;
-    const int rank = ring->devUserRanks[0];
-    const int nextRank = ring->devUserRanks[1];
+    const int rank = ring->userRanks[0];
+    const int nextRank = ring->userRanks[1];
    const int root = args->root;

    T *inputBuf = (T*)args->sendbuff;
@@ -10,7 +10,6 @@

 #include "collectives.h"
 #include "devcomm.h"
-#include "op128.h"

 #define COLL_UNROLL 2
 #define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1)  // Using balanced tree instead of split tree
@@ -320,154 +319,71 @@ class ncclFunction {
 };

 #ifdef ENABLE_COLLTRACE
-#define traceColl(elem,launch_type) \
+#define traceColl(launch_type) { \
    uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
-    shmem.comm.collTrace[pos].bid = blockIdx.x; \
-    shmem.comm.collTrace[pos].funcIndex = shmem.work.header.funcIndex; \
-    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (shmem.comm.collTrace[pos].data_0)); \
-    if (elem.header.type == ncclWorkTypeP2p) { \
-      struct ncclWorkElemP2p *p2pElems = (struct ncclWorkElemP2p *)&elem; \
-      shmem.comm.collTrace[pos].p2p[0].connIndex = p2pElems[0].connIndex; \
-	    shmem.comm.collTrace[pos].p2pOpCount[0] = p2pElems[0].opCount; \
-      shmem.comm.collTrace[pos].p2p[0].ngroups = p2pElems[0].ngroups; \
-      shmem.comm.collTrace[pos].p2p[0].nWarps = p2pElems[0].nWarps; \
-      shmem.comm.collTrace[pos].p2p[0].warpStart = p2pElems[0].warpStart; \
-      shmem.comm.collTrace[pos].p2p[0].peer = (uint16_t)(p2pElems[0].peer); \
-	    shmem.comm.collTrace[pos].p2p[1].connIndex = p2pElems[1].connIndex; \
-      shmem.comm.collTrace[pos].p2pOpCount[1] = p2pElems[1].opCount; \
-      shmem.comm.collTrace[pos].p2p[1].ngroups = p2pElems[1].ngroups; \
-      shmem.comm.collTrace[pos].p2p[1].nWarps = p2pElems[1].nWarps; \
-      shmem.comm.collTrace[pos].p2p[1].warpStart = p2pElems[1].warpStart; \
-      shmem.comm.collTrace[pos].p2p[1].peer = (uint16_t)(p2pElems[1].peer); \
-      shmem.comm.collTrace[pos].type = (ncclCollTraceP2pElemType|launch_type); \
-    } else { \
-      shmem.comm.collTrace[pos].opCount = elem.opCount; \
-      shmem.comm.collTrace[pos].coll.nWarps = elem.header.nWarps; \
-      shmem.comm.collTrace[pos].coll.bid = elem.bid; \
-      shmem.comm.collTrace[pos].coll.nChannels = elem.nChannels; \
-      shmem.comm.collTrace[pos].type = (ncclCollTraceCollElemType|launch_type); \
-    }
+    struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
+    collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    collTrace->bid = blockIdx.x; \
+    collTrace->funcIndex = shmem.work.header.funcIndex; \
+    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
+    if (shmem.work.header.type == ncclWorkTypeP2p) { \
+      struct ncclWorkElemP2p *p2pElems = shmem.work.p2pElems; \
+      collTrace->p2p[0].connIndex = 0; \
+	    collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
+      collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
+      collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \
+      collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \
+      collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \
+	    collTrace->p2p[1].connIndex = 0; \
+      collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
+      collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \
+      collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \
+      collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
+      collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
+      collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
+    } else if (shmem.work.header.type == ncclWorkTypeColl) { \
+      struct ncclWorkElem *elems = shmem.work.elems; \
+      collTrace->opCount = elems[0].opCount; \
+      collTrace->coll.nWarps = elems[0].nWarps; \
+      collTrace->coll.bid = elems[0].bid; \
+      collTrace->coll.nChannels = elems[0].nChannels; \
+      collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
+    } \
+  }

-#define traceKernelLaunch(elem,firstLaunch)  { \
-    traceColl(elem,(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType)); \
+#define traceKernelLaunch(firstLaunch)  { \
+    traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
  }
 #define traceKernelEnd()  { \
    uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
-    shmem.comm.collTrace[pos].bid = bid; \
-    shmem.comm.collTrace[pos].type = ncclCollTraceKernelEndType; \
+    struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
+    collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    collTrace->bid = blockIdx.x; \
+    collTrace->type = ncclCollTraceKernelEndType; \
  }
 #define traceAbort()  { \
    uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
-    shmem.comm.collTrace[pos].bid = bid; \
-    shmem.comm.collTrace[pos].type = ncclCollTraceAbortType; \
+    struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
+    collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    collTrace->bid = blockIdx.x; \
+    collTrace->type = ncclCollTraceAbortType; \
  }
 //  traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
 #define traceData(data2, data4, data8_0, data8_1) { \
-    uint32_t pos = atomicAdd(ncclShmem->comm.collTraceTail, 1)%COLLTRACE_NUM_ITEMS; \
-    ncclShmem->comm.collTrace[pos].bid = blockIdx.x; \
-    ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
-    ncclShmem->comm.collTrace[pos].funcIndex = data2; \
-    ncclShmem->comm.collTrace[pos].data_0 = data4; \
-    ncclShmem->comm.collTrace[pos].opCount = data8_0; \
-    ncclShmem->comm.collTrace[pos].data_1 = data8_1; \
-    ncclShmem->comm.collTrace[pos].type = ncclCollTraceDataType; \
+    uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    struct ncclCollTrace* collTrace = ncclShmem->comm.collTrace+pos; \
+    collTrace->bid = blockIdx.x; \
+    collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
+    collTrace->funcIndex = data2; \
+    collTrace->data_0 = data4; \
+    collTrace->opCount = data8_0; \
+    collTrace->data_1 = data8_1; \
+    collTrace->type = ncclCollTraceDataType; \
  }
 #else
-#define traceKernelLaunch()
-#define traceAbort()
 #define traceData(data2, data4, data8_0, data8_1)
 #endif

-#ifdef ENABLE_PROFILING
-#define __insert_timestamp(line_num) do { \
-      if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
-        shmem.prof.elem[shmem.prof.count].line = line_num; \
-        shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
-        shmem.prof.count++; \
-      } \
-    } while(0);
-#else
-#define __insert_timestamp(line_num)
-#endif
-
-// Copy src to dst and fill extra size with zeroes
-template<typename Tdst, typename Tsrc>
-__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) {
-  static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0,
-      "copyToShmem needs sizes which are multiple of 16B");
-  static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small");
-  static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle");
-  uint64_t *d = reinterpret_cast<uint64_t*>(dst);
-  uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
-  uint64_t *shmemPtr = d;
-  int offset = 2*tid;
-  uint64_t v0, v1;
-  if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) {
-    v0 = v1 = 0ULL;
-  } else {
-    v0 = s[offset] ; v1 = s[offset+1];
-  }
-  if (offset < sizeof(Tdst)/sizeof(uint64_t)) {
-    shmemPtr[offset] = v0; shmemPtr[offset+1] = v1;
-  }
-}
-
-template<typename T>
-__device__ int copyToShmem(T *dst, T const *src, int turn=0) {
-  static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh");
-  uint64_t *d = reinterpret_cast<uint64_t*>(dst);
-  uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
-  int t = threadIdx.x - turn;
-  if (t < 0) t += blockDim.x;
-  int n = sizeof(T)/sizeof(uint64_t);
-
-  int delta = (n + WARP_SIZE-1) & -WARP_SIZE; // round up to warp lane 0
-  if (delta < blockDim.x) {
-    turn += delta;
-    if (turn >= blockDim.x) turn -= blockDim.x;
-  }
-  else
-    turn = 0;
-
-  n -= t;
-  d += t;
-  s += t;
-  #pragma unroll
-  for (int i=0; i < divUp(sizeof(T), WARP_SIZE*sizeof(uint64_t)); i++) {
-    if (n > 0) {
-      *d = *s;
-      d += blockDim.x;
-      s += blockDim.x;
-      n -= blockDim.x;
-    }
-  }
-  return turn;
-}
-
-template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
-struct RunWorkElement {
-  __device__ void run(ncclWorkElem*) {
-    // Put NOT IMPLEMENTED behavior here.
-  }
-};
-
-template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
-struct RunWork {
-  // This __forceinline__ is necessary. The compiler was inserting a function call
-  // here from the LL ncclKernel.
-  __device__ __forceinline__ void run(ncclWork *w) {
-    int wid = threadIdx.x / WARP_SIZE;
-    int inc = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1;
-    #pragma unroll 1
-    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) {
-      if (wid < w->header.nWarps)
-        RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]);
-    }
-  }
-};

 struct ncclShmemGroup {
  ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
@@ -484,18 +400,67 @@ struct ncclShmemData {
    struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
  };
  uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
-  struct ncclDevComm comm;
-  struct ncclChannel channel;
-  uint64_t pad[2];
-  struct ncclWork work;
+  int channelId;
+  alignas(16) struct ncclDevComm comm;
+  alignas(16) struct ncclDevChannel channel;
+  alignas(16) struct ncclWork work;
 #ifdef ENABLE_PROFILING
  struct ncclProf prof;
 #endif
 };
 static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");

+#ifdef ENABLE_PROFILING // profiling build: record (line, timestamp) samples into shmem.prof
+#define __insert_timestamp(line_num) do { /* expands against a variable named `shmem` at the call site */ \
+      if (shmem.prof.count < PROFILE_NUM_ITEMS) { /* samples are dropped once PROFILE_NUM_ITEMS have been stored */ \
+        shmem.prof.elem[shmem.prof.count].line = line_num; \
+        shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+        shmem.prof.count++; \
+      } \
+    } while(0); /* NOTE(review): the trailing ';' makes `__insert_timestamp(x);` expand to two statements */
+#else // profiling disabled: the macro expands to nothing
+#define __insert_timestamp(line_num)
+#endif // ENABLE_PROFILING
+
+// Copy 16-byte aligned data from src to dst, one chunk per thread: thread `tid` handles byte offset 16*tid. You must call with at least `(bytes+15)/16` threads.
+inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) {
+  int offset = 16*tid; // byte offset of the chunk this thread is responsible for
+  if (offset < bytes) { // threads whose chunk would start at or past `bytes` copy nothing
+    ulong2 *src2, *dst2;
+    src2 = (ulong2*)((char const*)src + offset); // NOTE(review): this cast drops the const qualifier of src; src2 is only read below
+    dst2 = (ulong2*)((char*)dst + offset);
+    dst2->x = src2->x;
+    dst2->y = src2->y; // both halves are always copied, so a `bytes` that is not a multiple of the chunk size is rounded up (presumably to 16 bytes — confirm sizeof(ulong2))
+  }
+}
+
+template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
+struct RunWorkElement { // primary template: run() has an empty body; presumably specializations elsewhere provide the real per-collective code
+  __device__ void run(ncclWorkElem*) {
+    // Put NOT IMPLEMENTED behavior here.
+  }
+};
+
+template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
+struct RunWork { // walks the elements stored in one ncclWork and hands each used one to RunWorkElement<...>::run
+  // This __forceinline__ is necessary. The compiler was inserting a function call
+  // here from the LL ncclKernel.
+  __device__ __forceinline__ void run(ncclWork *w) {
+    int wid = threadIdx.x / WARP_SIZE; // index of the calling thread's warp within the block
+    ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0]; // first element; ncclWorkTypeRegColl work is read through the regElems array
+    int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem); // byte distance between consecutive elements of the chosen array
+    #pragma unroll 1
+    while ((char*)we + stride <= (char*)(w+1) && we->isUsed) { // stop when an element would extend past the end of *w, or at the first unused element
+      if (wid < we->nWarps) { // only warps with an index below the element's nWarps run it
+        RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
+      }
+      we = (ncclWorkElem*)((char*)we + stride); // advance in bytes because the element size depends on the work type
+    }
+  }
+};
+
 static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
-  if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) {
+  if (we->isUsed && we->redOpArgIsPtr) {
    /* redOpArg is a pointer to the scalar value, so we'll dereference it
     * here so that redOpArg holds the bits of the scalar going forward.
     * The tricky thing is we don't know its type T since that's encoded in
@@ -518,10 +483,10 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
 extern __device__ struct ncclShmemData *ncclShmem;

 template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
-__device__ void ncclKernel(struct ncclDevComm* comm)  {
+__device__ void ncclKernel(
+    struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
+  )  {
  int tid = threadIdx.x;
-  int nthreads = blockDim.x;
-  int bid = blockIdx.x;
  __shared__ struct ncclShmemData shmem;
  ncclShmem = &shmem;
  if (tid == 0) {
@@ -529,47 +494,72 @@ __device__ void ncclKernel(struct ncclDevComm* comm)  {
      shmem.groups[i].barrier = 0;
      for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
    }
- }
-  __syncthreads();
+  }
+  // To map blockId to channelId, we need the n'th set bit of channelMask which
+  // is the inverse of counting the number of set bits among the first n.
+  if (tid < WARP_SIZE) {
+    int x = tid;
+    if (channelMask & (1ull<<x)) {
+      int y = __popcll(channelMask & ((1ull<<x)-1));
+      if (blockIdx.x == y) shmem.channelId = x;
+    }
+    if (32 < MAXCHANNELS) {
+      x = 32 + tid;
+      if (channelMask & (1ull<<x)) {
+        int y = __popcll(channelMask & ((1ull<<x)-1));
+        if (blockIdx.x == y) shmem.channelId = x;
+      }
+    }
+  }
+  __syncthreads(); // publish shmem.channelId
+  int channelId = shmem.channelId;

-  int turn = copyToShmem(&shmem.comm, comm);
+  if (true) {
+    void *dst, *src;
+    int bytes;
+    // Use first 3 warps to load comm, channel, and work into shmem
+    switch (tid/WARP_SIZE) {
+    case 0:
+      dst = &shmem.comm;
+      src = comm;
+      bytes = sizeof(ncclDevComm);
+      static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
+      break;
+    case 1:
+      // Get address of channel without incurring indirect load from ncclDevComm::channels
+      dst = &shmem.channel;
+      src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
+      bytes = sizeof(ncclDevChannel);
+      static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
+      break;
+    case 2:
+      dst = &shmem.work;
+      src = workHead + blockIdx.x;
+      bytes = sizeof(ncclWork);
+      static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
+      break;
+    default:
+      bytes = 0;
+      break;
+    }
+    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+  }
+  __syncthreads(); // publish shmem
 #ifdef ENABLE_PROFILING
  if (tid == 0) {
    shmem.prof.count = 0;
-    shmem.prof.seq = shmem.comm.devProf[bid].seq;
+    shmem.prof.seq = shmem.comm.devProf[blockIdx.x].seq;
  }
 #endif
  if (tid == 0) __insert_timestamp(__LINE__);
-  // get address of channel without incurring indirect load from ncclDevCom::channels
-  ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
-  turn = copyToShmem(&shmem.channel, channel, turn);

-  __syncthreads(); // publish ncclShmem
-  if (tid == 0) __insert_timestamp(__LINE__);
-  if (tid == 0) __insert_timestamp(__LINE__);
-
-  ncclWork *workFifoHost = shmem.channel.workFifo;
-  ncclWork *workFifoDev = shmem.channel.workFifoDev;
-  int workFifoIx = shmem.channel.index;
-  bool firstLaunch = true;
+  if (COLLTRACE && tid == 0) traceKernelLaunch(true);

  while (true) {
-    copyToShmem(&shmem.work, &workFifoDev[workFifoIx], tid, nthreads);
-    if (tid == 0) __insert_timestamp(__LINE__);
-    { // Check whether the last operation was aborted and make sure all threads exit
-      int aborted = tid == 0 ? *comm->abortFlag : 0;
-      if (__any(aborted)) { // publish shmem.work
-        if (COLLTRACE && tid == 0) traceAbort();
-        break;
-      }
-      if (tid == 0)
-        workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
+    // Notify host that all fifo reads are complete.
+    if (tid == 0 && shmem.work.header.isLast && shmem.work.header.inFifo) {
+      *shmem.channel.workFifoDone = shmem.work.header.doneAcks;
    }
-    if (tid == 0) __insert_timestamp(__LINE__);
-
-    workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
-    if (tid == 0)
-      channel->index = workFifoIx; // write back to real channel, not shmem shadow

    __syncwarp();
    if (shmem.work.header.type == ncclWorkTypeColl) {
@@ -579,52 +569,57 @@ __device__ void ncclKernel(struct ncclDevComm* comm)  {
    }
    __syncthreads();

-    if (COLLTRACE && tid == 0) {
-      traceKernelLaunch(shmem.work.elems[0],firstLaunch);
-      firstLaunch = false;
-      #pragma unroll 1
-      for(int e=1; e < NCCL_MAX_WORK_ELEMENTS && shmem.work.elems[e].header.type != ncclWorkTypeUnused; e ++) {
-        traceColl(shmem.work.elems[e], 0);
+    if (tid == 0) __insert_timestamp(__LINE__);
+    if (shmem.work.header.funcIndex == FnIndex) {
+      RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
+    } else {
+      NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
+    }
+
+    int workIxNext = shmem.work.header.workNext;
+    __syncthreads();
+    if (shmem.work.header.isLast) break;
+
+    copyToShmem16(tid, &shmem.work, workHead + workIxNext, sizeof(ncclWork));
+
+    { // Check whether the last operation was aborted and make sure all threads exit
+      int aborted = tid == 0 ? *comm->abortFlag : 0;
+      if (__any(aborted)) { // publish shmem.work
+        traceAbort();
+        break;
      }
    }
-    if (tid == 0) __insert_timestamp(__LINE__);
-    if (shmem.work.header.funcIndex == FnIndex)
-      RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
-    else
-      NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
-
-    if (shmem.work.header.isLast) break;
-    __syncthreads();
+    if (COLLTRACE && tid == 0) traceColl(false);
  }
-  if (COLLTRACE && tid == 0) traceKernelEnd()
+  if (COLLTRACE && tid == 0) traceKernelEnd();
 #ifdef ENABLE_PROFILING
  if (shmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
    __syncthreads();
-    copyToShmem(shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof);
-    if (tid == 0) shmem.comm.devProf[bid].seq++;
+    copyToShmem16(tid, shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof, sizeof(struct ncclProf));
+    if (tid == 0) shmem.comm.devProf[blockIdx.x].seq++;
  }
 #endif
 }

 // IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) emits four __global__
 // entry points -- NCCL_KERN_NAME, NCCL_KERN_NAME_DEBUG, NCCL_KERN_NAME_LL128 and
 // NCCL_KERN_NAME_LL128_DEBUG -- each capped with __launch_bounds__(NCCL_MAX_NTHREADS, 1)
 // and each forwarding (comm, channelMask, workHead) to ncclKernel<...>. The variants
 // differ only in the last two boolean template arguments: the *_DEBUG kernels pass
 // true for the first, the *_LL128 kernels pass true for the second.
 // NOTE(review): those two flags look like COLLTRACE and USING_LL128 as used in the
 // ncclKernel body -- confirm against the ncclKernel template parameter list.
 #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) \
-__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
-  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, false>(comm); \
+__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
+  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, false>(comm, channelMask, workHead); \
 } \
 \
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) \
-__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
-  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, false>(comm); \
+__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
+  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, false>(comm, channelMask, workHead); \
 } \
 \
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) \
-__global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
-  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, true>(comm); \
+__global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
+  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, true>(comm, channelMask, workHead); \
 } \
 \
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) \
-__global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
-  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, true>(comm); \
+__global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
+  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, true>(comm, channelMask, workHead); \
 }

 // Examples :     AllReduce, RING, LL,    Sum,   uint8
@@ -683,7 +678,7 @@ __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
  IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);

 // AllToAll Pivot primitive only has one function.
-#define IMPL_COLL_ALLTOALL_PIVOT(func) \
+#define IMPL_COLL_F(func) \
  IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t);

 #endif
@@ -23,7 +23,7 @@ __device__ struct ncclShmemData* ncclShmem;
  NCCL_FUNC5(func, RING,    devredop, type, nullify), \
  NCCL_FUNC5(func, COLLNET, devredop, type, nullify)

-#if defined(RCCL_BFLOAT16)
+#if defined(__CUDA_BF16_TYPES_EXIST__)
 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(func, devredop, nullForFloat) \
  NCCL_FUNC4(func, devredop, int8_t, 0), \
@@ -35,7 +35,7 @@ __device__ struct ncclShmemData* ncclShmem;
  NCCL_FUNC4(func, devredop, half, nullForFloat), \
  NCCL_FUNC4(func, devredop, float, nullForFloat), \
  NCCL_FUNC4(func, devredop, double, nullForFloat), \
-  NCCL_FUNC4(func, devredop, rccl_bfloat16, nullForFloat)
+  NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat)
 #define NCCL_FUNCS3B(func, devredop) \
  NCCL_FUNC4(func, devredop, int8_t, 0), \
  NCCL_FUNC4(func, devredop, int8_t, 0), \
@@ -89,13 +89,12 @@ __device__ struct ncclShmemData* ncclShmem;
  NCCL_FUNCS3B(func, Sum)

 // Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[2+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
+__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
 #if __CUDA_ARCH__
  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
-  NCCL_FUNC_NAME(AllToAllPivot, RING, SIMPLE, Sum, int8_t),
  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
  NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
@@ -105,8 +104,8 @@ __device__ ncclKern_t ncclFuncs[2+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedO
  NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
  NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
  NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
-  #if defined(RCCL_BFLOAT16)
-    NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16),
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+    NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16),
  #endif
  NCCL_FUNCS2B(Broadcast),
  NCCL_FUNCS2A(Reduce),
@@ -17,7 +17,7 @@ namespace {
    int tid = threadIdx.x;
    int tn = blockDim.x;
    #pragma unroll 1
-    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) {
+    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
      ncclWorkElem *we = &w->elems[e];
      intptr_t eltN = we->count;
      int bid = we->bid;
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,8 +7,6 @@
 #ifndef OP128_H_
 #define OP128_H_

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-#else
 inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
  asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
      : "=l"(v0), "=l"(v1) : "l"(ptr));
@@ -67,6 +64,5 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
  v0 = tmp8[0];
  v1 = tmp8[1];
 }
-#endif

 #endif
@@ -155,5 +155,4 @@ struct PrimitivesWithoutDirect {
 #include "prims_simple.h"
 #include "prims_ll.h"
 #include "prims_ll128.h"
-
 #endif
@@ -183,7 +183,7 @@ private:

  template<int BeginIx>
  __device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
-    #pragma unroll
+    #pragma unroll 1
    for (int i=BeginIx; i < MaxRecv; i++) {
      if (i < fan.nrecv()) {
        union ncclLLFifoLine* src = recvPtr(i) + offset;
@@ -412,7 +412,7 @@ private:
      }
      if (RECV) {
        data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
-        #pragma unroll MaxRecv
+        #pragma unroll 1
        for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
          peerData = readLLFinish(offset, line, i);
          data = MULTI<RedOp,T>()(redOp, peerData, data);
@@ -502,11 +502,11 @@ private:
    // If we are going to support oneshot collNet + LL, then we would need to add connector index here
    int nrecv=0, nsend=0;
    while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
-      loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
+      loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
      nrecv++;
    }
    while (nsend < MaxSend && sendPeers[nsend] >= 0) {
-      loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
+      loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
      nsend++;
    }
    this->fan = Fan(nrecv, nsend);
@@ -5,11 +5,12 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "op128.h"
+#if defined(ENABLE_NPKIT)
+#include "npkit/npkit.h"
+#endif

 #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)

-
 template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
 class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
@@ -53,6 +54,15 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
  uint64_t* barriers;
  uint64_t* barrier_next;

+#if defined(ENABLE_NPKIT)
+public:
+  int npKitCtxIdx = 0;
+  uint64_t npKitDataProcessEntryTime = 0;
+  uint64_t npKitDataProcessExitTime = 0;
+  uint64_t npKitDataProcessTotalTime = 0;
+private:
+#endif
+
  inline __device__ void barrier() {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  if (nthreads != WARP_SIZE)
@@ -405,11 +415,11 @@ public:
    auto *channel = &ncclShmem->channel;
    int nrecv=0, nsend=0;
    while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
-      loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
+      loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
      nrecv++;
    }
    while (nsend < MaxSend && sendPeers[nsend] >= 0) {
-      loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
+      loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
      nsend++;
    }
    this->fan = Fan(nrecv, nsend);
@@ -50,7 +50,6 @@ class Primitives<
  uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
  uint64_t* barriers;
  uint64_t* barrier_next;
-  const uint64_t opCount;
  uint32_t* next_hdp_reg;

 #if defined(ENABLE_NPKIT)
@@ -377,6 +376,7 @@ private:
          waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
          subBarrier();
          #pragma unroll 1
+          // Loop over peers
          for (int j=0; j<fan.nsend(); j++) {
            int i = (j+shift)%fan.nsend();
            int peerOffset = i*peerElem;
@@ -423,9 +423,9 @@ private:
    }
  }

-  __device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
+  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
    if (flags & (RoleWaitRecv|RolePostRecv)) {
-      auto *conn = &peer->recv[connIndex].conn;
+      auto *conn = &peer->recv[connIndex];
      step = conn->step;
      step = roundUp(step, SlicePerChunk*StepPerSlice);
      if (flags & RolePostRecv) {
@@ -463,14 +463,14 @@ private:
    }
  }

-  __device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
+  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
    if (flags & (RoleWaitSend|RolePostSend)) {
-      auto *conn = &peer->send[connIndex].conn;
+      auto *conn = &peer->send[connIndex];
      step = conn->step;
      step = roundUp(step, SlicePerChunk*StepPerSlice);
      if (flags & RolePostSend) {
        connStepPtr = conn->tail;
-	next_hdp_reg = conn->next_hdp_reg;
+	    next_hdp_reg = conn->next_hdp_reg;
      }
      if (flags & RoleWaitSend) {
        ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
@@ -513,8 +513,7 @@ private:
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
    ):
    tid(tid),
-    stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)),
-    opCount(ncclShmem->work.elems[0].opCount) {
+    stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {

    // For send operations, we need an extra warp to overlap the threadfence and the copy
    this->nthreads = nthreads;
@@ -552,8 +551,8 @@ private:
    if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
    if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];

-    loadRecvConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
-    loadSendConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
+    loadRecvConn(&ncclShmem->channel.peers[peer], connIndex, e);
+    loadSendConn(&ncclShmem->channel.peers[peer], connIndex, e);

    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
  }
@@ -13,7 +13,7 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclRing *ring = &ncclShmem->channel.ring;
@@ -23,7 +23,7 @@ namespace {
    const ssize_t loopSize = nChannels*chunkSize;
    const ssize_t size = args->count;
    const int rank = ncclShmem->comm.rank;
-    const int prevRank = ring->devUserRanks[nranks-1];
+    const int prevRank = ring->userRanks[nranks-1];
    const int root = args->root;

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
@@ -13,11 +13,11 @@ namespace {
  template<typename T, typename RedOp, typename Proto>
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int nthreads = args->header.nWarps*WARP_SIZE;
+    const int nthreads = args->nWarps*WARP_SIZE;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclRing *ring = &ncclShmem->channel.ring;
-    int const *ringRanks = ring->devUserRanks;
+    int const *ringRanks = ring->userRanks;
    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
@@ -15,6 +15,8 @@
 template<typename T, typename RedOp>
 struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+    void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
+    size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);

 #if defined(ENABLE_NPKIT)
    bool isNpKitThread = (tid == 0);
@@ -38,34 +40,35 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {

    if (args->peer == ncclShmem->comm.rank) {
      struct ncclWorkElemP2p* recvArgs = args-1;
-      if (args->buff != recvArgs->buff) {
+      void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
+      if (buff != recvBuff) {

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
        if (isNpKitThread) {
-          NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
+          NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
        }
 #endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
        if (isNpKitThread) {
-          NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
+          NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
        }
 #endif

-        ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
+        ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
        if (isNpKitThread) {
-          NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
+          NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
        }
 #endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
        if (isNpKitThread) {
-          NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
+          NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
        }
 #endif
@@ -73,11 +76,10 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      }
    } else {
      using Proto = ProtoSimple<1, 1>;
-      ssize_t const count = args->count;
      int const chunkSize = args->chunkSize/sizeof(T);
      int const peer = args->peer;
      Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
-        (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
+        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);

 #if defined(ENABLE_NPKIT)
      if (isNpKitThread) {
@@ -93,9 +95,9 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      }
 #endif

-      ssize_t offset = 0;
+      size_t offset = 0;
      do {
-        int nelem = min(chunkSize, count-offset);
+        int nelem = min(size_t(chunkSize), count-offset);
        prims.directSend(offset, offset, nelem);
        offset += nelem;
      } while(offset < count);
@@ -133,11 +135,12 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {

    if (args->peer != ncclShmem->comm.rank) {
      using Proto = ProtoSimple<1, 1>;
-      ssize_t const count = args->count;
+      void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
+      ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
      int const chunkSize = args->chunkSize/sizeof(T);
      int const peer = args->peer;
      Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
-        (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
+        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);

 #if defined(ENABLE_NPKIT)
      if (isNpKitThread) {
@@ -153,9 +156,9 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      }
 #endif

-      ssize_t offset = 0;
+      size_t offset = 0;
      do {
-        int nelem = min(chunkSize, count-offset);
+        int nelem = min(size_t(chunkSize), count-offset);
        prims.directRecv(offset, nelem);
        offset += nelem;
      } while(offset < count);
@@ -182,11 +185,11 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
    #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
    int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
    args += group;
-    if (args->header.type == ncclWorkTypeUnused) return;
-
    tid -= args->warpStart * WARP_SIZE;
    int nthreads = args->nWarps * WARP_SIZE;
    group |= (args->connIndex<<16); // Used to select connIndex 1
+
+    if (args->p2pType == ncclWorkP2pTypeUnused) return;
    if (tid >= nthreads || args->peer == -1) return;
    if ((group%2) == 0) {
      runRecv(tid, nthreads, group, args);
@@ -9,29 +9,37 @@
 #include "nccl_net.h"
 #include <stdlib.h>
 #include <stdarg.h>
+#include <sys/syscall.h>

 int ncclDebugLevel = -1;
+static int pid = -1;
+static char hostname[1024];
 thread_local int ncclDebugNoWarn = 0;
+char ncclLastError[1024] = ""; // Global string for the last error in human readable form
 uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
 FILE *ncclDebugFile = stdout;
 pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
+std::chrono::steady_clock::time_point ncclEpoch;
+
+static __thread int tid = -1;

 void ncclDebugInit() {
  pthread_mutex_lock(&ncclDebugLock);
  if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
  const char* nccl_debug = getenv("NCCL_DEBUG");
+  int tempNcclDebugLevel = -1;
  if (nccl_debug == NULL) {
-    ncclDebugLevel = NCCL_LOG_NONE;
+    tempNcclDebugLevel = NCCL_LOG_NONE;
  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
-    ncclDebugLevel = NCCL_LOG_VERSION;
+    tempNcclDebugLevel = NCCL_LOG_VERSION;
  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
-    ncclDebugLevel = NCCL_LOG_WARN;
+    tempNcclDebugLevel = NCCL_LOG_WARN;
  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
-    ncclDebugLevel = NCCL_LOG_INFO;
+    tempNcclDebugLevel = NCCL_LOG_INFO;
  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
-    ncclDebugLevel = NCCL_LOG_ABORT;
+    tempNcclDebugLevel = NCCL_LOG_ABORT;
  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
-    ncclDebugLevel = NCCL_LOG_TRACE;
+    tempNcclDebugLevel = NCCL_LOG_TRACE;
  }

  /* Parse the NCCL_DEBUG_SUBSYS env var
@@ -65,6 +73,8 @@ void ncclDebugInit() {
        mask = NCCL_ENV;
      } else if (strcasecmp(subsys, "ALLOC") == 0) {
        mask = NCCL_ALLOC;
+      } else if (strcasecmp(subsys, "CALL") == 0) {
+        mask = NCCL_CALL;
      } else if (strcasecmp(subsys, "ALL") == 0) {
        mask = NCCL_ALL;
      }
@@ -76,12 +86,16 @@ void ncclDebugInit() {
    free(ncclDebugSubsys);
  }

+  // Cache pid and hostname
+  getHostName(hostname, 1024, '.');
+  pid = getpid();
+
  /* Parse and expand the NCCL_DEBUG_FILE path and
   * then create the debug file. But don't bother unless the
   * NCCL_DEBUG level is > VERSION
   */
  const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
-  if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
+  if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
    int c = 0;
    char debugFn[PATH_MAX+1] = "";
    char *dfn = debugFn;
@@ -95,12 +109,10 @@ void ncclDebugInit() {
          *dfn++ = '%';
          break;
        case 'h': // %h = hostname
-          char hostname[1024];
-          getHostName(hostname, 1024, '.');
          dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
          break;
        case 'p': // %p = pid
-          dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
+          dfn += snprintf(dfn, PATH_MAX, "%d", pid);
          break;
        default: // Echo everything we don't understand
          *dfn++ = '%';
@@ -111,15 +123,15 @@ void ncclDebugInit() {
    *dfn = '\0';
    if (debugFn[0] != '\0') {
      FILE *file = fopen(debugFn, "w");
-      if (file != NULL) {
+      if (file != nullptr) {
+        setbuf(file, nullptr); // disable buffering
        ncclDebugFile = file;
      }
    }
  }

-#ifdef ENABLE_TRACE
-  ncclEpoch = std::chrono::high_resolution_clock::now();
-#endif
+  ncclEpoch = std::chrono::steady_clock::now();
+  __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
  pthread_mutex_unlock(&ncclDebugLock);
 }

@@ -128,45 +140,53 @@ void ncclDebugInit() {
 * they can share the debugging mechanisms and output files
 */
 void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
+  // Formats one log line ("host:pid:tid ..." prefix followed by the printf-style
+  // message) and writes it to ncclDebugFile when `level` and `flags` pass the
+  // ncclDebugLevel / ncclDebugMask filter. WARN messages are additionally saved
+  // in ncclLastError, whether or not they pass the filter.
-  if (ncclDebugLevel == -1) ncclDebugInit();
+  if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
   if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
+
+  // Save the last error (WARN) as a human readable string
+  if (level == NCCL_LOG_WARN) {
+    pthread_mutex_lock(&ncclDebugLock);
+    va_list vargs;
+    va_start(vargs, fmt);
+    (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
+    va_end(vargs);
+    pthread_mutex_unlock(&ncclDebugLock);
+  }
   if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
 
-  // Gather the rank information. This can take > 1us so we want to make sure
-  // we only do it when needed.
-  char hostname[1024];
-  getHostName(hostname, 1024, '.');
+  // The thread id is looked up once per thread and cached in the thread-local `tid`.
+  if (tid == -1) {
+    tid = syscall(SYS_gettid);
+  }
+
   int cudaDev;
-  hipGetDevice(&cudaDev);
-  int pid = getpid();
-  int tid = syscall(SYS_gettid);
+  // The NCCL CALL trace prefix does not print the device, so skip the query there.
+  if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
+    hipGetDevice(&cudaDev);
+  }
 
   char buffer[1024];
   size_t len = 0;
-  pthread_mutex_lock(&ncclDebugLock);
-  if (level == NCCL_LOG_WARN)
-    len = snprintf(buffer, sizeof(buffer),
-        "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
-  else if (level == NCCL_LOG_INFO)
-    len = snprintf(buffer, sizeof(buffer),
-        "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
-#ifdef ENABLE_TRACE
-  else if (level == NCCL_LOG_TRACE) {
-    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
+  if (level == NCCL_LOG_WARN) {
+    len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
+                   hostname, pid, tid, cudaDev, filefunc, line);
+  } else if (level == NCCL_LOG_INFO) {
+    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
+  } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
+    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
+  } else if (level == NCCL_LOG_TRACE) {
+    auto delta = std::chrono::steady_clock::now() - ncclEpoch;
     double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
-    len = snprintf(buffer, sizeof(buffer),
-        "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line);
+    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
+                   hostname, pid, tid, cudaDev, timestamp, filefunc, line);
   }
-#endif
+
   if (len) {
+    // snprintf reports the untruncated length; keep len inside buffer so the
+    // message append and the trailing newline below cannot index past its end.
+    if (len > sizeof(buffer)-1) len = sizeof(buffer)-1;
     va_list vargs;
     va_start(vargs, fmt);
-    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
+    int msgLen = vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
     va_end(vargs);
-    fprintf(ncclDebugFile,"%s\n", buffer);
-    fflush(ncclDebugFile);
+    // vsnprintf returns a negative value on error and the would-be length on
+    // truncation: only advance on success, then clamp so that '\n' replaces the
+    // terminating '\0' instead of landing beyond the buffer.
+    if (msgLen > 0) len += msgLen;
+    if (len > sizeof(buffer)-1) len = sizeof(buffer)-1;
+    buffer[len++] = '\n';
+    fwrite(buffer, 1, len, ncclDebugFile);
   }
-  pthread_mutex_unlock(&ncclDebugLock);
 }

 NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
@@ -448,10 +448,10 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);

 // Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
 // remote proxies without risking deadlocks
-int ncclPxnDisable() {
+int ncclPxnDisable(struct ncclComm* comm) {
  static int pxnDisable = -1;
  if (pxnDisable == -1) {
-    if (ncclNetVersion() == 4) {
+    if (comm && ncclNetVersion(comm) == 4) {
      INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
      pxnDisable = 1;
    } else {
@@ -490,7 +490,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
  return ncclSuccess;
 }

-ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) {
  // Precompute paths between GPUs/NICs.

  // Remove everything in case we're re-computing
@@ -518,16 +518,16 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
      }
    }

-    if (peerInfos == NULL) continue;
+    if (comm == NULL) continue;
    // Remove GPUs we can't talk to because of containers.
-    struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank[0];
+    struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank[0];
    for (int p=0; p<system->nodes[GPU].count; p++) {
      if (p == g) continue;
-      struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank[0];
+      struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank[0];
      int shm;
-      NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+      NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
      int p2p;
-      NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+      NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
      if (shm == 0 && p2p == 0) {
        // Mark this peer as inaccessible. We'll trim it later.
        system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
@@ -543,7 +543,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
    for (int g=0; g<system->nodes[GPU].count; g++) {
      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-      if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
+      if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) {
        int pxnGpu = -1;

        for (int p=0; p<system->nodes[GPU].count; p++) {
@@ -556,7 +556,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
          pxnGpu = p;

          int netDev;
-
          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank[0], &netDev));
          // To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
          if (netDev == netNode->id) break;
@@ -602,8 +601,8 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
    }
    for (int j=0; j<gpu->gpu.nRanksPerGpu; j++ ) {
      if (gpu->gpu.rank[j] == comm->rank) {
-	myDomain = domains[g];
-	break;
+        myDomain = domains[g];
+        break;
      }
    }
  }
@@ -768,7 +767,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  // We want to spread channels used when there aren't many and progressively
  // fill the whole space of nChannels. To do so we mirror the bits in the
  // nChannels space.
-  for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+  for (int c=0; c<comm->p2pnChannels; c++) {
    int mirror = 0;
    for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
    comm->p2pChannels[c] = mirror;
@@ -275,8 +275,8 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
  for (int i=0; i<ngpus; i++) {
    for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
      if (system->nodes[GPU].nodes[i].gpu.rank[j] == nextRank) {
-	*g = i;
-	return ncclSuccess;
+	    *g = i;
+	    return ncclSuccess;
      }
    }
  }
@@ -1103,10 +1103,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
    NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
    *proxyRank = rank;

-    int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel();
+    int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
    // See whether we can use the remote rank preferred device.
    if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
-      int netDev = comm->peerInfo[peerRank].netDev;
+      // Find local NIC number close to local cudaDev
+      int cudaDev = comm->peerInfo[peerRank].cudaDev;
+      int localRank;
+      if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess;
+      int netDev = comm->peerInfo[localRank].netDev;
      int n;
      // Check that device exists on our node
      if (ncclParamCrossNic() == 0) {
@@ -724,11 +724,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
  int netDevCount = 0;
-  if (collNetSupport()) {
-    NCCLCHECK(collNetDevices(&netDevCount));
+  if (collNetSupport(comm)) {
+    NCCLCHECK(collNetDevices(comm, &netDevCount));
    for (int n=0; n<netDevCount; n++) {
      ncclNetProperties_t props;
-      NCCLCHECK(collNetGetProperties(n, &props));
+      NCCLCHECK(collNetGetProperties(comm, n, &props));
      struct ncclXmlNode* netNode;
      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
      NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -737,16 +737,18 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
      NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
      NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
-      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+      bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
+      INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
      NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
    }
  }
  if (netDevCount == 0) {
-    NCCLCHECK(ncclNetDevices(&netDevCount));
+    NCCLCHECK(ncclNetDevices(comm, &netDevCount));
  }
  for (int n=0; n<netDevCount; n++) {
    ncclNetProperties_t props;
-    NCCLCHECK(ncclNetGetProperties(n, &props));
+    NCCLCHECK(ncclNetGetProperties(comm, n, &props));
    struct ncclXmlNode* netNode;
    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -756,7 +758,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
    NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
-    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
  }

  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
@@ -903,8 +907,8 @@ ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int*
  for (int g=0; g<system->nodes[GPU].count; g++) {
    for ( int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++ ){
      if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
-	*localRank = g;
-	return ncclSuccess;
+	    *localRank = g;
+	    return ncclSuccess;
      }
    }
  }
@@ -198,20 +198,31 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
  for (int i=0; i<system->nodes[GPU].count; i++) {
    for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
      if (system->nodes[GPU].nodes[i].gpu.rank[j] == rank) {
-	*index = i;
-	return ncclSuccess;
+	    *index = i;
+	    return ncclSuccess;
      }
    }
  }
  return ncclInternalError;
 }

+static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) { // Finds the GPU node whose gpu.dev equals `dev` and stores that node's first rank in *rank.
+  *rank = -1; // value left in *rank when no GPU node matches
+  for (int i=0; i<system->nodes[GPU].count; i++) {
+    if (system->nodes[GPU].nodes[i].gpu.dev == dev) {
+      *rank = system->nodes[GPU].nodes[i].gpu.rank[0]; // only rank[0] is reported, even when the node carries several ranks
+      return ncclSuccess;
+    }
+  }
+  return ncclInternalError; // no GPU node has this device index
+}
+
 // Returns XGMI speed in GB/s
 static float ncclTopoXGMISpeed(int gcn) {
  return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH;
 }

 #define ncclGetKernelIndex(p_comm) \
-  (((p_comm)->topo->ll128Enabled ? 1 : 0)*2 + ((p_comm)->hostDevComm.collTraceThread ? 1 : 0))
+  (((p_comm)->topo->ll128Enabled ? 1 : 0)*2 + ((p_comm)->collTraceThread ? 1 : 0))

 #endif
@@ -235,11 +235,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads);
  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
-    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
-    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
 #else
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
@@ -11,446 +11,262 @@
 #include "transport.h"
 #include "channel.h"

-#define MAX_ASYNC_OPS 128
-thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
-thread_local int ncclGroupIndex = 0;
-thread_local int ncclGroupMode = 0;
-thread_local ncclResult_t ncclGroupError = ncclSuccess;
-extern struct allocationTracker allocTracker[];
+__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
+__thread ncclResult_t ncclGroupError = ncclSuccess; // seeds `ret` in ncclGroupEndInternal; reset to ncclSuccess once the outermost group has been processed
+__thread struct ncclComm* ncclGroupCommHead = nullptr; // head of the comm list that group end walks through comm->groupNext
+__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; // head of the comm list that group end walks through comm->preconnectNext to queue preconnect jobs
+__thread struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> ncclAsyncJobs; // jobs queued by ncclAsyncLaunch while a group is open

-bool ncclAsyncMode() {
-  return ncclGroupMode > 0;
-}
-
-ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) {
-  if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
-  return ret;
-}
-
-struct ncclInitArgs {
-  ncclInitFunc_t func;
-  int cudaDev;
-  ncclComm_t* newcomm;
-  int ndev;
-  ncclUniqueId commId;
-  int myrank;
-  int virtualId;
-};
-struct ncclCollArgs {
-  ncclComm_t comm;
-  uint16_t connIndex;
-};
-
-enum ncclAsyncFuncType {
-  ASYNC_FUNC_INVALID = 0,
-  ASYNC_FUNC_INIT = 1,
-  ASYNC_FUNC_COLL = 2,
-};
-struct ncclAsyncArgs {
-  ncclResult_t ret;
-  enum ncclAsyncFuncType funcType;
-  union {
-    ncclCollArgs coll;
-    ncclInitArgs init;
-  };
-};
-
-thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
-
-void* ncclAsyncThreadMain(void* args_) {
-  struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
-  NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank,
-				  args->init.cudaDev, args->init.virtualId));
-  return args;
-}
-
-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId) {
-  if (ncclGroupIndex >= MAX_ASYNC_OPS) {
-    WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
-    return ncclAsyncErrCheck(ncclInvalidUsage);
+ncclResult_t ncclAsyncLaunch( // Runs func(job) right away when no group is open; otherwise queues the job on ncclAsyncJobs.
+    struct ncclAsyncJob* job,
+    ncclResult_t(*func)(struct ncclAsyncJob*),
+    void(*undo)(struct ncclAsyncJob*), // may be null
+    void(*destructor)(void*) // may be null
+  ) {
+  if (0 == ncclGroupDepth) { // no enclosing group: execute synchronously and return func's result
+    ncclResult_t res = func(job);
+    if (res != ncclSuccess && undo) undo(job);
+    if (destructor) destructor(job); // called on both the success and the failure outcome
+    return res;
+  } else {
+    job->func = func; // keep the callbacks on the job so they can be invoked later
+    job->undo = undo;
+    job->destructor = destructor;
+    ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
+    return ncclSuccess; // func has not run yet; its status is recorded in job->result when the job executes
   }
-  int index = ncclGroupIndex++;
-  struct ncclAsyncArgs* args = ncclGroupArgs+index;
-  args->funcType = ASYNC_FUNC_INIT;
-  args->init.func = func;
-  args->init.cudaDev = cudaDev;
-  args->init.newcomm = newcomm;
-  args->init.ndev = ndev;
-  memcpy(&args->init.commId, &commId, sizeof(commId));
-  args->init.myrank = myrank;
-  args->init.virtualId = virtualId;
-  return ncclSuccess;
 }

-ncclResult_t ncclAsyncColl(ncclComm_t comm) {
-  struct ncclAsyncArgs* args = ncclGroupArgs;
-  for (int i=0; i<ncclGroupIndex; i++) {
-    if (args->coll.comm == comm) return ncclSuccess;
-    args++;
+void* ncclAsyncJobMain(void* arg) { // Thread start routine passed to pthread_create: runs job->func and records its status in job->result.
+  struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg;
+  job->result = job->func(job);
+  if (job->result != ncclSuccess) {
+    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result);
   }
-  if (ncclGroupIndex >= MAX_ASYNC_OPS) {
-    WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
-    return ncclAsyncErrCheck(ncclInvalidUsage);
-  }
-  ncclGroupIndex++;
-  args->funcType = ASYNC_FUNC_COLL;
-  args->coll.comm = comm;
-  return ncclSuccess;
+  return arg; // the job pointer doubles as the thread's exit value
 }

 NCCL_API(ncclResult_t, ncclGroupStart);
 ncclResult_t ncclGroupStart() {
   NVTX3_FUNC_RANGE_IN(nccl_domain);
-  if (ncclGroupMode == 0) {
-    memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
-  }
-  ncclGroupMode++;
+  NCCLCHECK(ncclGroupStartInternal()); // the public entry point only delegates; ncclGroupStartInternal is defined outside this hunk
+  TRACE_CALL("ncclGroupStart()");
   return ncclSuccess;
 }

-static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
-  struct ncclInfo info = { ncclFuncSend, "Send",
-    NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
-    1, 1 };
-  info.channelId = channelId;
-  info.opCount = opCount;
-  info.connIndex = connIndex;
-  NCCLCHECK(ncclSetupP2pKernel(&info));
-  return ncclSuccess;
-}
-
-static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
-  struct ncclInfo info = { ncclFuncRecv, "Recv",
-    NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
-    1, 1 };
-  info.channelId = channelId;
-  info.opCount = opCount;
-  info.connIndex = connIndex;
-  NCCLCHECK(ncclSetupP2pKernel(&info));
-  return ncclSuccess;
-}
-
-void* ncclAsyncThreadPreconnect(void* args_) {
-  struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
-  struct ncclComm* comm = args->coll.comm;
-  CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
-  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
-  NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, args->coll.connIndex));
-  return args;
-}
-
-static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
-  size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
-  int nChannels = minChannels;
-  while (size > maxSize && nChannels <= maxChannels/2) {
-    nChannels *= 2;
-    size = DIVUP(totalSize, nChannels);
-  }
-  ALIGN_SIZE(size, minSize);
-  return size;
-}
-
-RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);
-
 NCCL_API(ncclResult_t, ncclGroupEnd);
 ncclResult_t ncclGroupEnd() {
   NVTX3_FUNC_RANGE_IN(nccl_domain);
-  if (ncclGroupMode == 0) {
+  NCCLCHECK(ncclGroupEndInternal()); // the public entry point only delegates; ncclGroupEndInternal does the group processing
+  TRACE_CALL("ncclGroupEnd()");
+  return ncclSuccess;
+}
+
+struct ncclPreconnectJob { // async job that carries the communicator to preconnect
+  struct ncclAsyncJob base; // first member, so ncclPreconnectFunc can cast the ncclAsyncJob* it receives back to ncclPreconnectJob*
+  struct ncclComm* comm;
+};
+ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { // Job body: runs ncclTransportP2pSetup for the job's communicator.
+  struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
+  struct ncclComm* comm = job->comm;
+  CUDACHECK(hipSetDevice(comm->cudaDev));
+  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); // return value is not checked, so a failed affinity change does not fail the job
+  NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); // connection index 1
+  if (comm->p2pNet) NCCLCHECK(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P_NET)); // second setup pass only when comm->p2pNet is set
+  return ncclSuccess;
+}
+
+static ncclResult_t doLaunches(struct ncclComm* head) { // Prepares, launches and finishes the pending kernel plans of every comm reachable from `head` via groupNext.
+  ncclResult_t result = ncclSuccess;
+  struct ncclComm* cliqueComm0 = head->intraComm0; // NOTE(review): read once from `head` and never refreshed when cliqueHead advances - confirm later cliques are delimited as intended
+  struct ncclComm* cliqueHead = head;
+  struct ncclComm* cliqueNextHead;
+  bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
+  // This outer loop iterates over cliques of comms which are siblings of the
+  // same global entity. We calculate a clique as all comms which have the same
+  // `intraComm0` value.
+  do {
+    struct ncclComm* comm = cliqueHead;
+    bool capturingYes = false, capturingNo = false;
+    do {
+      (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true; // record whether this comm is being graph-captured
+      CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
+      NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
+      if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
+      comm = comm->groupNext;
+    } while (comm != nullptr && comm->intraComm0 == cliqueComm0);
+    cliqueNextHead = comm; // first comm past this clique, or nullptr at the end of the list
+
+    if (capturingYes && capturingNo) {
+      // We have entered barriers but are aborting without leaving them. Thus
+      // these comms are permanently trashed. We need a good mechanism for
+      // tracking and reporting that.
+      WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured.");
+      result = ncclInvalidUsage;
+      goto failure;
+    }
+
+    while (true) { // Iterate rounds of launches for clique.
+      bool moreRounds; // assigned on every pass of the do-while below, which always runs at least once
+      comm = cliqueHead;
+      do { // Iterate clique members.
+        struct ncclComm* next = comm->groupNext;
+        if (useBarrier) {
+          // Barrier reduction result tells us if this was the final round.
+          moreRounds = 0 != ncclCommIntraBarrierOut(comm);
+        } else {
+          moreRounds = comm->unlaunchedPlansHead != nullptr;
+        }
+        if (moreRounds) {
+          // Pop next unlaunched kernel
+          struct ncclKernelPlan* plan = comm->unlaunchedPlansHead;
+          if (plan != nullptr) {
+            comm->unlaunchedPlansHead = plan->next;
+            CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
+            NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
+            NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
+          }
+          // Barrier reduction input indicates if we require further rounds.
+          if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0);
+          if (plan != nullptr) {
+            NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
+          }
+        } else { // Final round.
+          CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
+          NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure);
+        }
+        comm = next;
+      } while (comm != cliqueNextHead);
+      if (!moreRounds) break; // reflects the value computed for the last clique member visited in this round
+    }
+    cliqueHead = cliqueNextHead;
+  } while (cliqueHead != nullptr);
+failure:
+  return result; // reached on success too; `result` is ncclSuccess unless a check above failed
+}
+
+ncclResult_t ncclGroupEndInternal() {
+  if (ncclGroupDepth == 0) {
    WARN("ncclGroupEnd: not in a group call.");
    return ncclInvalidUsage;
  }
-  ncclGroupMode--;
-  if (ncclGroupMode > 0) return ncclSuccess;
+  ncclGroupDepth--;
+  if (ncclGroupDepth > 0) return ncclSuccess;
+
  int savedDev;
  CUDACHECK(hipGetDevice(&savedDev));
-  int activeThreads = 0;
-  int doneArray[MAX_ASYNC_OPS];
-  for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
+
  ncclResult_t ret = ncclGroupError;
-  int usingCudaGraphAll = -1;
-  hipGraph_t* graphs = NULL;
-  if (ret != ncclSuccess) goto group_cleanup;
+  bool jobsDone = false;
+  if (ret != ncclSuccess) goto failure;

-  /* Launch async ncclCommInitRank */
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_INIT) {
-      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
-      activeThreads++;
-      doneArray[i] = 0;
-    }
-  }
-  /* For init, since we use threads, we just wait for threads to complete */
-  while (activeThreads) {
-    for (int i=0; i<ncclGroupIndex; i++) {
-      struct ncclAsyncArgs* args = ncclGroupArgs+i;
-      if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
-        int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
-        if (err == EBUSY) continue;
-        if (err != 0) ret = ncclSystemError;
-        if (args->ret != ncclSuccess) ret = args->ret;
-        doneArray[i] = 1;
-        activeThreads--;
-      }
-    }
+  if (ncclGroupCommPreconnectHead != nullptr) { // queue one preconnect job per comm on the preconnect list
+    struct ncclComm* comm = ncclGroupCommPreconnectHead;
+    do {
+      struct ncclPreconnectJob* job;
+      NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, failure); // go through `failure` so already-queued jobs are destroyed and the thread-local group state and device are restored
+      job->base.func = ncclPreconnectFunc;
+      job->base.undo = nullptr;
+      job->base.destructor = free;
+      job->comm = comm;
+      ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base);
+
+      struct ncclComm* next = comm->preconnectNext;
+      comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1); // same 0x1 sentinel the failure path writes; presumably "not on the preconnect list" - confirm against taskAppend()
+      comm = next;
+    } while (comm != nullptr);
   }

-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
-      args->coll.connIndex = 1;
-      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
-    }
-  }
+  if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
+    struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
+    do {
+      pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job);
+      job = job->next;
+    } while (job != nullptr);

-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
-      int err = pthread_join(ncclGroupThreads[i], NULL);
+    job = ncclIntruQueueHead(&ncclAsyncJobs);
+    do {
+      int err = pthread_join(job->thread, nullptr);
       if (err != 0) {
-        WARN("Error waiting for pthread_join : %s", strerror(errno));
-        return ncclSystemError;
+        WARN("Error waiting for pthread_join : %s", strerror(err)); // pthread_join returns the error number itself and does not set errno
+        ret = ncclSystemError;
       }
-      INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
-      NCCLCHECKGOTO(args->ret, ret, end);
-      args->coll.comm->connect[1] = 0;
-    }
+      if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result;
+      job = job->next;
+    } while (job != nullptr);
+
+    jobsDone = true;
+    if (ret != ncclSuccess) goto failure;
  }

-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
-      args->coll.connIndex = NCCL_CONN_IDX_P2P_NET;
-      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
-    }
+  if (ncclGroupCommHead != nullptr) {
+    NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure); // launch the work queued on the grouped comms
+    do { // then detach every comm from the group list
+      struct ncclComm* comm = ncclGroupCommHead;
+      struct ncclComm* next = comm->groupNext; // read before ncclGroupCommLeave, which overwrites comm->groupNext
+      ncclGroupCommLeave(comm);
+      ncclGroupCommHead = next;
+    } while (ncclGroupCommHead != nullptr);
   }

-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
-      int err = pthread_join(ncclGroupThreads[i], NULL);
-      if (err != 0) {
-        WARN("Error waiting for pthread_join : %s", strerror(errno));
-        return ncclSystemError;
+  if (false) {
+  failure:
+    struct ncclComm* comm = ncclGroupCommHead;
+    while (comm != nullptr) {
+      struct ncclComm* next = comm->groupNext;
+      ncclGroupCommLeave(comm); // overwrites comm->groupNext
+      // We don't know if preconnect succeeded or happened at all, so clear
+      // the flags that let `taskAppend()` skip over checking if preconnect
+      // is needed.
+      comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
+      for (int i=0; i < comm->nRanks; i++) {
+        comm->tasks.peers[i].sendSeen = false;
+        comm->tasks.peers[i].recvSeen = false;
+        comm->connectSend[i] = 0;
+        comm->connectRecv[i] = 0;
+        comm->connectSend[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
+        comm->connectRecv[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
      }
-      INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P NET preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
-      NCCLCHECKGOTO(args->ret, ret, end);
-      args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET] = 0;
-    }
-  }
-
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      struct ncclComm* comm = args->coll.comm;
-      int node = comm->node;
-      int nNodes = comm->nNodes;
-      int localRank = comm->localRank;
-
-      // Compute how much to split operations
-      // Natural step size matching buffer steps.
-      ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
-      // Try to use all channels
-      int nChannelsMax = comm->p2pnChannelsPerPeer;
-      int nChannelsMin = nChannelsMax;
-      // Try to use all channels, but one channel per operation.
-      //while (nChannelsMin*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels) && nChannelsMin > 1) nChannelsMin /= 2;
-      // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
-      //while (nChannelsMax*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels)*4 && nChannelsMax > 1) nChannelsMax /= 2;
-
-      while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
-        // schedule delta 0, +1, -1, +2, -2, ...
-        // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
-        for (int d=0; d<=nNodes/4; d++) {
-          int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
-          int index = 0;
-          int delta = deltas[index];
-sched_delta:
-          uint32_t recvNode = (node+nNodes-delta)%nNodes;
-          uint32_t sendNode = (node+delta)%nNodes;
-          int steps = comm->maxLocalRanks;
-          for (int s=0; s<steps; s++) {
-            int recvIndex = (localRank-s+steps)%steps;
-            int recvPeer = recvIndex<comm->nodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
-            int sendIndex = (localRank+s)%steps;
-            int sendPeer = sendIndex<comm->nodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
-            struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL;
-            struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL;
-            if (recv != NULL || send != NULL) {
-              ssize_t totRecvBytes = -1, totSendBytes = -1;
-              if (recv != NULL) totRecvBytes = recv->nbytes;
-              if (send != NULL) totSendBytes = send->nbytes;
-              if (recv) comm->p2pRecvCount--;
-              if (send) comm->p2pSendCount--;
-              if (recvPeer == comm->rank) { // Check self send/recv
-                if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; }
-                if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; }
-                if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; }
-              }
-              void* recvBuff = recv ? recv->buff : NULL;
-              void* sendBuff = send ? send->buff : NULL;
-              // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL.
-              if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle();
-              if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle();
-
-              ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
-              ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
-
-              uint16_t sendIdx = 1, recvIdx = 1;
-              if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold())
-                sendIdx = NCCL_CONN_IDX_P2P_NET;
-              if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold())
-                recvIdx = NCCL_CONN_IDX_P2P_NET;
-
-              ssize_t sendOffset = 0;
-              ssize_t recvOffset = 0;
-              int sendRemaining = 1, recvRemaining = 1;
-              int chunk = 0;
-              do {
-                int channelId;
-                // Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure
-                // to use multiple channels to guarantee progress on all ranks from the same node.
-                ssize_t recvbytes = totRecvBytes-recvOffset;
-                ssize_t sendbytes = totSendBytes-sendOffset;
-                if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
-                if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
-                // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
-                // (total size == 0), otherwise set size to -1.
-                if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
-                if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
-                if (send || recv) {
-                  if (recv) {
-                    NCCLCHECK(ncclChannelCompute(comm, recvPeer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
-                  }
-                  else
-                    recvPeer = -1;
-                  if (send) {
-                    NCCLCHECK(ncclChannelCompute(comm, sendPeer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
-                  }
-                  else
-                    sendPeer = -1;
-                  NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, recv ? ((char*)recvBuff)+recvOffset : NULL, recv ? recv->opCount : 0, recvIdx), ret, group_cleanup);
-                  NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, send ? ((char*)sendBuff)+sendOffset : NULL, send ? send->opCount : 0, sendIdx), ret, group_cleanup);
-                }
-                recvOffset += recvChunkSize;
-                sendOffset += sendChunkSize;
-                chunk++;
-              } while (sendRemaining || recvRemaining);
+      comm->unlaunchedPlansHead = nullptr;
+      // Reclaim abandoned kernel plan memory. Note ncclWork structs were already
+      // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
+      while (!ncclIntruQueueEmpty(&comm->planQueue)) {
+        struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
+        // Persistent plans will be reclaimed via the callbackQueue when the
+        // graph drops its UserObject reference.
+        if (!plan->persistent) {
+          for (int c=0; c < MAXCHANNELS; c++) {
+            while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
+              struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
+              ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
            }
          }
-          index++;
-          if (index == 1 && deltas[1] == deltas[0]) index++;
-          if (index == 2 && deltas[2] == deltas[0]) index++;
-          if (index == 3 && deltas[3] == deltas[2]) index++;
-          if (index == 3 && deltas[3] == deltas[1]) index++;
-          if (index < 4) {
-            delta = deltas[index];
-            goto sched_delta;
-          }
+          ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
        }
      }
+      // Reset comm->tasks to empty.
+      comm->tasks.nTasksColl = 0;
+      comm->tasks.nTasksP2p = 0;
+      comm->tasks.streams = nullptr;
+      ncclIntruQueueConstruct(&comm->tasks.collQueue);
+      comm->tasks.collBytesTotal = 0;
+      for (int i=0; i < comm->nRanks; i++) {
+        ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
+        ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
+      }
+      comm = next;
    }
  }

-  /* Collectives are done in three steps :
-   * 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
-   * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
-   * 2. Barrier Wait. No CUDA call is permitted
-   * 3. Enqueue Events. CUDA event wait/enqueue.
-   * This is needed because step 2 cannot call any CUDA primitive, otherwise if
-   * cudaFree happens between 1 and 3, it could block that CUDA call and
-   * prevent some ranks from launching their network threads, which would
-   * prevent the NCCL call from completing, blocking the cudaFree call.
-   */
-
-  // Check whether we are in cuda graph mode
-  NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      ncclComm_t comm = args->coll.comm;
-      NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup);
-      if (usingCudaGraphAll == -1) {
-        usingCudaGraphAll = comm->usingCudaGraph;
-      } else if (usingCudaGraphAll != comm->usingCudaGraph) {
-        WARN("Illegal to have some communicators in graph mode while others not");
-        ret = ncclInvalidUsage;
-        goto group_cleanup;
-      }
-    }
-  }
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      ncclComm_t comm = args->coll.comm;
-      NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup);
-    }
-  }
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      if (args->coll.comm->userStream == hipStreamDefault/* ||
-          args->coll.comm->userStream == hipStreamPerThread ||
-          args->coll.comm->userStream == hipStreamLegacy*/)
-        CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
-      if (usingCudaGraphAll == 1) {
-        NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end);
-      } else {
-        ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo);
-      }
-      NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end);
-    }
-  }
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
-      NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end);
-    }
-  }
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL) {
-      if (args->coll.comm->userStream == hipStreamDefault/* ||
-          args->coll.comm->userStream == hipStreamPerThread ||
-          args->coll.comm->userStream == hipStreamLegacy*/)
-        CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
-      NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end);
-      NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end);
-    }
+  while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { // drain the job queue on both the success and the failure path
+    struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs);
+    if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job); // undo only when the job threads were actually run and joined (jobsDone)
+    if (job->destructor) job->destructor((void*)job);
   }

-  goto end;
-group_cleanup:
-  if (ret != ncclSuccess) {
-    // At least one call in the group failed. Since we want to make that group
-    // an atomic operation, we need to cancel all operations.
-    for (int i=0; i<ncclGroupIndex; i++) {
-      struct ncclAsyncArgs* args = ncclGroupArgs+i;
-      if (args->funcType == ASYNC_FUNC_INIT) {
-        if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm);
-        *args->init.newcomm = NULL;
-      } else {
-        struct ncclComm* comm = args->coll.comm;
-        // Reset aggregation counters
-        comm->asyncOpCount = 0;
-        comm->asyncTotalSize = 0;
-        // Dequeue p2p lists
-        if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
-          for (int peer=0; peer<comm->nRanks; peer++) {
-            if (comm->p2pSends[peer]) comm->p2pSends[peer]->recycle();
-            if (comm->p2pRecvs[peer]) comm->p2pRecvs[peer]->recycle();
-          }
-          comm->p2pSendCount = comm->p2pRecvCount = 0;
-        }
-        ncclLaunchReset(comm);
-      }
-    }
-  }
-end:
  ncclGroupError = ncclSuccess;
-  ncclGroupIndex = 0;
+  ncclGroupCommHead = nullptr;
+  ncclGroupCommPreconnectHead = nullptr;
  CUDACHECK(hipSetDevice(savedDev)); // do other clean-ups first before calling hipSetDevice, because this call can fail too
-  if (graphs) free(graphs);
  return ret;
 }
@@ -11,28 +11,40 @@
 #include "nccl.h"
 #include "checks.h"
 #include "align.h"
+#include "utils.h"
 #include <sys/mman.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
 #include "rccl_vars.h"

+uint64_t clockNano(); // from utils.h with which we have a circular dependency
+
 template <typename T>
-static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
-  CUDACHECK(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped));
+ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
+  ncclResult_t result = ncclSuccess;
+  uint64_t time = 0;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
+  *ptr = nullptr;
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  time = clockNano();
+  CUDACHECKGOTO(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped), result, finish);
+  time = clockNano() - time;
  memset(*ptr, 0, nelem*sizeof(T));
-  INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
-  return ncclSuccess;
+  INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: hipHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
+finish:
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  return result;
 }
 #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+inline ncclResult_t ncclCudaHostFree(void* ptr) {
  CUDACHECK(hipHostFree(ptr));
  return ncclSuccess;
 }

 template <typename T>
-static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
+ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
  void* p = malloc(nelem*sizeof(T));
  if (p == NULL) {
    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
@@ -46,7 +58,7 @@ static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc,
 #define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

 template <typename T>
-static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
+ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
  if (nelem < oldNelem) return ncclInternalError;
  if (nelem == oldNelem) return ncclSuccess;

@@ -78,54 +90,126 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
 extern struct allocationTracker allocTracker[];

 template <typename T>
-static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
-
-  // Need async stream for P2P pre-connect + CUDA Graph
-  static bool streamCreated = false;
-  static hipStream_t stream;
-  if (rcclParamEnableHipGraph() && !streamCreated)
-  {
-    // Create stream only once to avoid performance penalty
-    CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-    streamCreated = true;
-  }
-
+ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
+  ncclResult_t result = ncclSuccess;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
+  *ptr = nullptr;
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  uint64_t time = clockNano();
  if (isFineGrain)
-    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
  else
-    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
+    CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
+  time = clockNano() - time;
+finish:
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
+  return result;
+}
+#define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)

-  if (rcclParamEnableHipGraph()) {
-    CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
-    CUDACHECK(hipStreamSynchronize(stream));
-    // NOTE: Currently the re-used stream is not destroyed
-    //CUDACHECK(hipStreamDestroy(stream));
-  } else {
-    CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
-    CUDACHECK(hipStreamSynchronize(NULL));
-  }
-
-  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+template <typename T>
+ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
+  ncclResult_t result = ncclSuccess;
+  uint64_t time0=0, time1=0, time2=0;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
+  *ptr = nullptr;
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  // Need a side stream so as not to interfere with graph capture.
+  hipStream_t stream;
+  time0 = clockNano();
+  CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+  time1 = clockNano();
+  if (isFineGrain)
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
+  else
+    CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
+  time2 = clockNano();
+  CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
+  CUDACHECKGOTO(hipStreamSynchronize(stream), result, finish);
+  CUDACHECKGOTO(hipStreamDestroy(stream), result, finish);
  int dev;
  CUDACHECK(hipGetDevice(&dev));
  if (dev < MAX_ALLOC_TRACK_NGPU) {
-    __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_SEQ_CST);
-    __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_SEQ_CST);
+    __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
+    __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
  }
-  return ncclSuccess;
+  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipStreamCreateWithFlags=%g hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9);
+finish:
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  return result;
 }
 #define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__)

 template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
-  return ncclSuccess;
+ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
+  ncclResult_t result = ncclSuccess;
+  uint64_t time = 0;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
+  *ptr = nullptr;
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  time = clockNano();
+  if (isFineGrain)
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
+  else
+    CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
+  time = clockNano() - time;
+  CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
+  int dev;
+  CUDACHECK(hipGetDevice(&dev));
+  if (dev < MAX_ALLOC_TRACK_NGPU) {
+    __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
+    __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
+  }
+  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
+finish:
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  return result;
+}
+#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__FILE__, __LINE__, __VA_ARGS__)
+
+// Synchronously copies nelem elements of type T from src to dst.
+// The copy is issued through ncclCudaMemcpyAsync on a temporary non-blocking
+// side stream, and the call only returns after that stream was synchronized.
+// The thread's stream-capture mode is exchanged for the relaxed mode while
+// the call runs and exchanged back before returning.
+// Returns ncclSuccess, or the first error that occurred.
+template <typename T>
+ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+  ncclResult_t result = ncclSuccess;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  // Need a side stream so as not to interfere with graph capture.
+  hipStream_t stream;
+  CUDACHECKGOTO(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking), result, finish);
+  // The stream exists from here on: failures must leave through `destroy`
+  // rather than `finish`, otherwise the stream would be leaked.
+  NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, destroy);
+  CUDACHECKGOTO(hipStreamSynchronize(stream), result, destroy);
+destroy:
+  {
+    // Report a failed destroy, but never let it mask an earlier error.
+    ncclResult_t destroyResult = ncclSuccess;
+    CUDACHECKGOTO(hipStreamDestroy(stream), destroyResult, destroyed);
+  destroyed:
+    if (result == ncclSuccess) result = destroyResult;
+  }
+finish:
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  return result;
+}
+
+template <typename T> // Enqueues a copy of nelem elements of T from src to dst on `stream`; this function does not synchronize the stream.
+ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, hipStream_t stream) {
+  ncclResult_t result = ncclSuccess;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed; // capture mode to install while this call runs
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode)); // exchange: per the runtime API, `mode` receives the previous mode
+  CUDACHECKGOTO(hipMemcpyAsync(dst, src, nelem*sizeof(T), hipMemcpyDefault, stream), result, finish); // size is in bytes; copy kind is hipMemcpyDefault
+finish:
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode)); // exchange back, reached on success and on copy failure
+  return result;
+}
+
+template <typename T> // Releases `ptr` with hipFree while the thread's stream-capture mode is temporarily relaxed.
+ncclResult_t ncclCudaFree(T* ptr) {
+  ncclResult_t result = ncclSuccess;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed; // capture mode to install while this call runs
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode)); // exchange: per the runtime API, `mode` receives the previous mode
+  CUDACHECKGOTO(hipFree(ptr), result, finish); // a failure is recorded in `result`, then falls into the restore below
+finish:
+  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode)); // exchange back, reached on success and on failure
+  return result;
 }

 // Allocate memory to be potentially ibv_reg_mr'd. This needs to be
 // allocated on separate pages as those pages will be marked DONTFORK
 // and if they are shared, that could cause a crash in a child process
-static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
+inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
  size_t page_size = sysconf(_SC_PAGESIZE);
  void* p;
  int size_aligned = ROUNDUP(size, page_size);
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -31,7 +31,8 @@ static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int
 }

 static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
-  *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
+  //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
+  *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
  return ncclSuccess;
 }

@@ -10,7 +10,7 @@

 #include "debug.h"

-// Check CUDA calls
+// Check CUDA RT calls
 #define CUDACHECK(cmd) do {                                 \
    hipError_t err = cmd;                                    \
    if( err != hipSuccess ) {                                \
@@ -143,9 +143,9 @@
  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
 } while (!(cond));

-#define NCCLCHECKTHREAD(a) do { \
-  if ((args->ret = (a)) != ncclSuccess) { \
-    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+#define NCCLCHECKTHREAD(a, args) do { \
+  if (((args)->ret = (a)) != ncclSuccess) { \
+    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
    return args; \
  } \
 } while(0)
@@ -10,25 +10,26 @@
 #include "nccl.h"
 #include "nccl_net.h"

-extern ncclCollNet_t* ncclCollNet;
 typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];

 // Translation to external API
-static const char* collNetName() { return ncclCollNet->name; }
-static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; }
-static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
-static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
-static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
-static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
-static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
-static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
-  NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
-static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
-static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
+static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
+static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
+static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
+static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
+static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
+/* DMA-BUF support */
+static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
+  NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
+static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
+static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }

-static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; }
+static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }

 #endif
@@ -47,10 +47,10 @@ struct ncclDevRedOpFull {
 /* Declare all collective operations */
 #define DECL5(func, algo, proto, devredop, type) \
  extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
-  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
-  extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
-  extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
-  extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm);
+  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
+  extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
+  extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
+  extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);

 #define CONCAT(a,b) a##b
 #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
@@ -10,25 +10,13 @@

 #include "transport.h"
 #include "p2p.h"
-// [RCCL]
-//#include "clique/CliqueManager.h"
-// [/RCCL]
-
-// Convert volatile access to atomic
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
-#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
-#else
-#define LOAD(VAR) *(VAR)
-#define STORE(DST, SRC) *(DST) = (SRC)
-#endif
-
-
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  #define HIPRT_CB
-#else
 #include "collectives.h"
+#include "proxy.h"
+#include "strongstream.h"

+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#define HIPRT_CB
+#else
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
  void *func;
@@ -77,8 +65,6 @@ struct ncclRecvMem {
  };
 };

-typedef hipError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
-
 enum helperThreadState {ThreadStart, ThreadStop};

 #define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
@@ -104,15 +90,87 @@ struct ncclNodeRanks {
  int* localRankToRank;
 };

-struct ncclComm {
-  struct ncclChannel channels[MAXCHANNELS];
+struct ncclDestructor { // Node of the destructor list whose head is ncclComm::destructorHead.
+  struct ncclDestructor* next; // next node in the list
+  void* obj; // opaque payload; presumably the object `fn` releases - confirm against the ncclCommPush*Free helpers
+  ncclResult_t(*fn)(struct ncclDestructor* me); // receives a destructor node (parameter is named `me`)
+};

+struct ncclCommCallback { // Element type of ncclComm::callbackQueue; drained by ncclCommPollCallbacks().
+  struct ncclCommCallback* next; // intrusive link used by the queue
+  ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); // called as cb->fn(comm, cb); may reclaim the memory of cb
+};
+
+struct ncclChannel { // Per-channel state; ncclComm embeds an array of MAXCHANNELS of these.
+  struct ncclChannelPeer* peers;
+  struct ncclDevChannelPeer* devPeers; // NOTE(review): "dev" prefix suggests a device-side counterpart of `peers` - confirm
+  struct ncclRing ring;
+  int* devRingUserRanks; // NOTE(review): presumably a device copy of the ring's user ranks - confirm
+  struct ncclTree tree;
+  struct ncclDirect collTree;
+  int id; // index of this channel
+  uint32_t workFifoSent; // last used work index+1
+  uint64_t p2pOpCount; // presumably the P2P operation counter that this change removes from ncclComm - confirm
+};
+
+struct ncclWorkList { // Queue node wrapping one ncclWork; element type of ncclKernelPlan::Channel::workQueue.
+  struct ncclWorkList* next; // intrusive link used by the queue
+  struct ncclWork work; // stored by value
+};
+
+struct ncclPointerList { // Queue node holding one raw pointer; element type of ncclKernelPlan::ipcMemQueue.
+  struct ncclPointerList* next; // intrusive link used by the queue
+  void *ptr; // untyped payload
+};
+
+struct ncclKernelPlan { // Kernel plan built from a comm's tasks; plans are queued in ncclComm::planQueue.
+  // A kernel plan is also a callback that reclaims itself. Hence this must
+  // be the first member.
+  struct ncclCommCallback reclaimer;
+  struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup
+
+  struct ncclComm* comm; // presumably the communicator this plan belongs to - confirm
+  struct ncclKernelPlan* next; // intrusive link used by ncclComm::planQueue
+
+  bool persistent; // aka captured in a graph
+  void *kernelFn; // untyped; presumably the kernel entry point to launch - confirm
+  int channelUbound; // only channels c < channelUbound are present
+  int channelCount; // number of channels present
+  uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
+  bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
+  int threadPerBlock; // presumably the launch block size - confirm
+  // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
+  struct ncclWork* workHead;
+
+  int collOpCount; // zero based for this plan
+
+  struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue; // queue of ncclPointerList nodes
+
+  struct Channel { // per-channel part of the plan
+    int nWork;
+    union {
+      int nWorkElem; // used for coll and reg coll
+      int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
+    };
+    size_t collBytes;
+    struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue; // queue of ncclWorkList nodes
+    struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue; // linked through ncclProxyOp::enqNext
+  } channels[MAXCHANNELS]; // one slot per possible channel; see channelMask for which are present
+};
+
+struct ncclComm {
+  struct ncclMemoryStack memPermanent, memScoped;
+  // List of destructors to run when comm is destructed
+  struct ncclDestructor* destructorHead;
+
+  struct ncclChannel channels[MAXCHANNELS];
  struct ncclPeerInfo* peerInfo;
  struct ncclTopoSystem* topo;

+  ncclNet_t* ncclNet;
+  ncclCollNet_t* ncclCollNet;
  void* bootstrap;
  // Bitmasks for ncclTransportP2pSetup
-  int connect[NCCL_MAX_CONNS];
  uint32_t* connectSend;
  uint32_t* connectRecv;

@@ -135,19 +193,13 @@ struct ncclComm {
  // localRanks and localRanktoRank for all nodes
  struct ncclNodeRanks* nodeRanks;

-  enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
-  hipStream_t userStream;
-  bool userStreamSet;
-  hipEvent_t doneEvent;
-  hipEvent_t intDoneEvent;
  bool checkPointers;
+  bool dmaBufSupport;

  // Counter for tracking CUDA launches (P2P and collectives included)
  uint64_t opCount;
  // Collective operation counter
  uint64_t collOpCount;
-  // P2P operation counter
-  uint64_t p2pOpCount;

  // Channels for collectives
  int nChannels;
@@ -165,10 +217,6 @@ struct ncclComm {
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];

-  // An internal CUDA stream for NCCL kernel CGMD launches
-  int groupCudaStream;
-  hipStream_t groupStream;
-
  // Whether there has been a fatal error in this communicator.
  ncclResult_t fatalError;

@@ -178,26 +226,33 @@ struct ncclComm {
  // Flags for enable P2P NET
  uint32_t p2pNet;
  uint32_t useIntraNet;
+  bool hasFineGrain;

-  // Device side of the communicator
-  struct ncclDevComm *devComm;
-  // Host copy of the devComm (to free CUDA allocs)
-  struct ncclDevComm hostDevComm;
+  // Device side of the communicator (for cudaFree's)
+  struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
+
+  // Operation pool.
+  int workFifoDepth; // size of workFifoHeap[], power of 2
+  struct ncclWork* workFifoHeap;
+  struct ncclWork* devWorkFifoHeap;
+  void* workFifoHeapGdrHandle;
+
+  // Work completion notification
+  uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
+  uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
+  uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.

  // Intra-process sync
+  struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
+  struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
+  int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks)
  int intraRank;
  int intraRanks;
-  int* intraBarrier;
-  int intraPhase;
-
-  // Storage for deferred intra-process launch
-  hipLaunchParams * intraParams;
-  hipLaunchParams *myParams;
-  pthread_t* intraThreads;
-  int* intraCudaDevs;
-  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
-  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  void* argsptrs[1];
+  uint32_t intraBarrierPhase;
+  char intraPad1[64 - sizeof(uint64_t)];
+  uint64_t intraBarrierCounter; // only used if this is intraComm0
+  char intraPad2[64 - sizeof(uint64_t)];
+  uint64_t intraBarrierGate; // only used if this is intraComm0

  struct ncclProxyState proxyState;

@@ -205,44 +260,108 @@ struct ncclComm {
  int collNetSupport;
  int intraHighestTransportType;

-  // Store info of async operations
-  struct ncclInfo* asyncOps;
-  int asyncOpCount;
-  size_t asyncTotalSize;
-  ssize_t channelSize;
-  int lastChannel;
-  enum { ROUND_ROBIN, SHORTEST_QUEUE } asyncAllocMode;
+  size_t channelSize; // User requested work size (bytes) for channel partitions

-  //list of async p2p operation queued in a group semantics
-  ncclP2Plist** p2pSends;
-  ncclP2Plist** p2pRecvs;
-  int p2pSendCount;
-  int p2pRecvCount;
+  // Internal streams
+  struct ncclStrongStream deviceStream, hostStream;

-  // [RCCL]
-  //CliqueManager* cliqueManager;    // CliqueManager handles pointer collection / distribution for clique-based kernels
-  //int rootPid;                     // Process ID of root
-  // [/RCCL]
-
-  // Store info for cudaGraph
-  int usingCudaGraph; // Only use it during capture time, not launch time
-  struct ncclQueueInfo* enqueueInfo;
-  int nQueueInfoCreated;
-  int nQueueInfoDestroyed;
-  hipGraphNode_t lastSetupNode;
-  unsigned long long lastCudaGraphId;
-  int driverVersion;
-  pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange;
-  pthread_t graphHelperThread;
-  struct ncclGraphHelperResources* graphHelperResources;
-  int disableGraphHelper;
-  int graphRegister;
+  // pools backed by comm->memPermanent
+  struct ncclMemoryPool memPool_ncclProxyOp;
+  struct ncclMemoryPool memPool_ncclKernelPlan;
+  struct ncclMemoryPool memPool_ncclPointerList;
+  // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
+  // this comm is not yet in a group.
+  struct ncclComm* groupNext;
+  // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
+  struct ncclComm* preconnectNext;
+  int persistentRefs; // number of persistent plan-lists capturing this comm
+  struct ncclTasks tasks;

  // user-created reduction ops
  int userRedOpCapacity, userRedOpFreeHead;
  ncclUserRedOp *userRedOps;
+
+  // Queue of things for the main thread to do
+  struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
+
+  // List of kernel plans built from tasks.
+  struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
+  // First of the unlaunched kernels in `planQueue`
+  struct ncclKernelPlan* unlaunchedPlansHead;
+
+  hipEvent_t doneEvent;
+  hipStream_t lastStream;
+
+#ifdef ENABLE_COLLTRACE
+  struct ncclCollTrace* collTrace;
+  volatile uint32_t *collTraceTail;
+  pthread_t collTraceThread;
+  volatile bool collTraceExit;
+#endif
 };

+// Set to true during an `atexit()` handler. We use this to intentionally leak
+// unfreed CUDA resources when cleaning up after return of `main()` to avoid
+// CUDA calls after CUDA runtime teardown.
+extern bool ncclMainExited;
+
+enum ncclLaunchMode { // Type of the global ncclParamLaunchMode.
+  ncclLaunchModeInvalid=0, // zero value; NOTE(review): presumably means "not set / unrecognized" - confirm
+  ncclLaunchModeParallel, // NOTE(review): meaning not visible here; the enum this replaces had a PARALLEL launch mode - confirm
+  ncclLaunchModeGroup // NOTE(review): meaning not visible here; the enum this replaces had a GROUP launch mode - confirm
+};
+extern enum ncclLaunchMode ncclParamLaunchMode;
+
+void ncclCommPushFree(struct ncclComm* comm, void* buf);
+void ncclCommPushCudaFree(struct ncclComm* comm, void* buf);
+void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf);
+void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle);
+
+inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) {
+  // Take everything currently queued (without waiting) and walk the returned list.
+  struct ncclCommCallback* pending = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false);
+  for (struct ncclCommCallback* following = nullptr; pending != nullptr; pending = following) {
+    // Read the link before the call: the callback may reclaim its own node.
+    following = pending->next;
+    NCCLCHECK(pending->fn(comm, pending));
+  }
+  return ncclSuccess;
+}
+
+inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { // Arrive at the intra-process barrier, contributing x to the sum that ncclCommIntraBarrierOut returns.
+  int phase = comm->intraBarrierPhase; // this comm's current phase; it is flipped in ncclCommIntraBarrierOut
+  if (comm->intraRanks == 1) {
+    // Release everyone (just me).
+    comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); // NOTE(review): writes comm's own gate while BarrierOut reads comm->intraComm0's - assumes intraComm0 == comm here; confirm
+  } else {
+    struct ncclComm* comm0 = comm->intraComm0; // the shared counter and gate live on the leader comm
+    uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); // one add: +1 arrival in the low 32 bits, +x in the high 32 bits
+    if (uint32_t(count) == uint32_t(comm->intraRanks)) { // true for the arrival that brings the low half up to intraRanks
+      // Reset.
+      __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
+      // Release everyone.
+      __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); // high 32 bits: summed x; bit 0: flipped phase
+    }
+  }
+}
+
+// Waits until the gate on comm->intraComm0 shows this comm's new phase, then returns the sum of x values contributed to ncclCommIntraBarrierIn(comm, x)
+inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) {
+  struct ncclComm* comm0 = comm->intraComm0; // the gate is read from the leader comm
+  comm->intraBarrierPhase ^= 1; // flip the local phase; matches the (phase^1) stored by ncclCommIntraBarrierIn
+  uint32_t phase = comm->intraBarrierPhase;
+  uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
+  if ((gate & 1) != phase) { // bit 0 of the gate carries the phase; mismatch means not released yet
+    uint64_t t0 = clockNano();
+    do {
+      // Spin vigorously for first 5us.
+      if (clockNano()-t0 >= 5*1000) sched_yield(); // after 5000 ns, yield between polls
+      gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
+    } while ((gate & 1) != phase);
+  }
+  if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); // acquire fence after the relaxed loads; skipped for a single rank
+  return gate>>32; // high 32 bits of the gate (truncated to uint32_t)
+}
+
 // Scrambles the bits of non-builtin values of ncclRedOp_t according to the
 // communicator memory address. Used to catch bugs so that integer handles
 // associated with this communicator won't collide with handles of other
@@ -37,7 +37,9 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
    case ncclUint8:
      return 1;
    case ncclFloat16:
+#if defined(RCCL_BFLOAT16)
    case ncclBfloat16:
+#endif
      return 2;
    case ncclInt32:
    case ncclUint32:
@@ -54,6 +56,7 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {

 #include "debug.h"
 #include "checks.h"
+#include "rocmwrap.h"
 #include "alloc.h"
 #include "utils.h"
 #include "param.h"
@@ -0,0 +1,88 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CUDAWRAP_H_
+#define NCCL_CUDAWRAP_H_
+
+#include <cuda.h>
+
+#if CUDART_VERSION >= 11030
+#include <cudaTypedefs.h>
+#else
+typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion);
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
+#endif
+
+#define CUPFN(symbol) pfn_##symbol
+
+// Check CUDA PFN driver calls
+#define CUCHECK(cmd) do {				      \
+    CUresult err = pfn_##cmd;				      \
+    if( err != CUDA_SUCCESS ) {				      \
+      const char *errStr;				      \
+      (void) pfn_cuGetErrorString(err, &errStr);	      \
+      WARN("Cuda failure '%s'", errStr);		      \
+      return ncclUnhandledCudaError;			      \
+    }							      \
+} while(false)
+
+#define CUCHECKGOTO(cmd, res, label) do {		      \
+    CUresult err = pfn_##cmd;				      \
+    if( err != CUDA_SUCCESS ) {				      \
+      const char *errStr;				      \
+      (void) pfn_cuGetErrorString(err, &errStr);	      \
+      WARN("Cuda failure '%s'", errStr);		      \
+      res = ncclUnhandledCudaError;			      \
+      goto label;					      \
+    }							      \
+} while(false)
+
+// Report failure at INFO level and continue (nothing is returned to the caller)
+#define CUCHECKIGNORE(cmd) do {						\
+    CUresult err = pfn_##cmd;						\
+    if( err != CUDA_SUCCESS ) {						\
+      const char *errStr;						\
+      (void) pfn_cuGetErrorString(err, &errStr);			\
+      INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr);	\
+    }									\
+} while(false)
+
+#define CUCHECKTHREAD(cmd, args) do {					\
+    CUresult err = pfn_##cmd;						\
+    if (err != CUDA_SUCCESS) {						\
+      INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
+      args->ret = ncclUnhandledCudaError;				\
+      return args;							\
+    }									\
+} while(0)
+
+#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
+
+#if CUDART_VERSION >= 11030
+/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
+DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
+DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
+DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
+DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
+DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
+DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020);
+DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
+DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
+#if CUDA_VERSION >= 11070
+DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
+#endif
+#endif
+
+/* CUDA Driver functions loaded with dlsym() */
+DECLARE_CUDA_PFN_EXTERN(cuInit);
+DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion);
+DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress);
+
+
+ncclResult_t cudaLibraryInit(void);
+
+#endif
@@ -10,8 +10,8 @@
 #include "nccl_net.h"
 #include <stdio.h>
 #include <chrono>
+#include <type_traits>

-#include <sys/syscall.h>
 #include <limits.h>
 #include <string.h>
 #include <pthread.h>
@@ -21,7 +21,7 @@

 extern int ncclDebugLevel;
 extern uint64_t ncclDebugMask;
-extern pthread_mutex_t ncclDebugOutputLock;
+extern pthread_mutex_t ncclDebugLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);

@@ -29,13 +29,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file

 // Let code temporarily downgrade WARN into INFO
 extern thread_local int ncclDebugNoWarn;
+extern char ncclLastError[];

 #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
+#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)

 #ifdef ENABLE_TRACE
 #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
-extern std::chrono::high_resolution_clock::time_point ncclEpoch;
+extern std::chrono::steady_clock::time_point ncclEpoch;
 #else
 #define TRACE(...)
 #endif
@@ -15,9 +15,6 @@
 #include "npkit/npkit_struct.h"
 #endif
 #include <stdint.h>
-// [RCCL] Support for clique-based kernels
-//#include "clique/CliqueCommon.h"
-// [/RCCL]


 #define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
@@ -33,7 +30,6 @@ extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
 #define NCCL_PROTO_LL 0
 #define NCCL_PROTO_LL128 1
-#define NCCL_PROTO_CLIQUE 1  // [RCCL] Clique takes up same protocol as unused LL128
 #define NCCL_PROTO_SIMPLE 2
 extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];

@@ -83,10 +79,6 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 #define NCCL_LL128_MAX_NTHREADS 256
 #define NCCL_LL128_ELEMS_PER_THREAD 28

-// Receiving from up to 3 sources is more compute intensive than sending
-// to 3 dests. Use 70% for reduce and 30% for bcast.
-#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
-
 #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)

@@ -145,7 +137,6 @@ struct ncclRing {
  // since we need to know how the user expects data to be ordered across
  // devices. Ordered from current device.
  int* userRanks;
-  int* devUserRanks;

  int index; // This rank's index in the ring
 };
@@ -171,7 +162,7 @@ struct ncclDirect {

 #define NCCL_CONN_IDX_P2P_NET 2
 #define NCCL_MAX_CONNS 3
-struct ncclPeer {
+struct ncclChannelPeer {
  struct ncclConnector send[NCCL_MAX_CONNS];
  struct ncclConnector recv[NCCL_MAX_CONNS];
 };
@@ -185,31 +176,43 @@ struct ncclDevComm;
 /* Make sure to adjust padding at the end of ncclWorkElem. */
 #define NCCL_WORK_SIZE 256

-enum ncclWorkElemType : uint8_t {
+enum ncclWorkType : uint8_t {
   ncclWorkTypeUnused=0,
   ncclWorkTypeColl=1,
   ncclWorkTypeP2p=2,
   ncclWorkTypeRegColl=3
 };
-enum ncclWorkElemSubType : uint8_t {
-  ncclWorkSubTypeUnused =0,
-  ncclWorkSubTypeSend,
-  ncclWorkSubTypeRecv
+enum ncclWorkP2PType : uint8_t {
+  ncclWorkP2pTypeUnused=0,
+  ncclWorkP2pTypeSend,
+  ncclWorkP2pTypeRecv
 };

-struct ncclWorkElemHeader {
+struct ncclWorkHeader {
+  union {
+    int32_t workNext;  // when isLast=0: Offset from kernel argument workHead
+    uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
+  };
  uint16_t funcIndex;
-  enum ncclWorkElemType type;
-  uint8_t nWarps:5;
-  uint8_t isLast:1;
+  uint8_t isLast:1; // last work for this kernel
+  uint8_t inFifo:1; // is this work in the fifo
+  enum ncclWorkType type;
 };

 struct ncclWorkElem {
-  struct ncclWorkElemHeader header;
-  uint8_t regUsed;
+  union {
+    uint8_t flagBits;
+    struct {
+      uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, pad_0:1, nWarps:4;
+    };
+  };
  uint8_t direct;
-  uint8_t redOpArgIsPtr;
-  uint8_t pad_0;
+  uint8_t bid;
+  uint8_t nChannels;
+  struct {
+    uint32_t root:30;
+    uint32_t connIndex:2;
+  };

  const void * sendbuff;
  void * recvbuff;
@@ -221,29 +224,40 @@ struct ncclWorkElem {
    // Instead, it needs the number of bidirectional rings.
    size_t pivotA2ANumBiRings;
  };
-  uint32_t root;
-  uint8_t bid;
-  uint8_t nChannels;
-  uint16_t connIndex;
  uint64_t redOpArg;
  uint64_t opCount;
 };
-static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size");
+
+static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
+#define NCCL_MAX_WORK_ELEMENTS 1

 struct ncclWorkElemP2p {
-  struct ncclWorkElemHeader header;
-  int32_t peer;
-  void* buff;
-  size_t count;
+  struct {
+    int32_t peer:30;
+    uint32_t connIndex:2;
+  };
+  union {
+    uint16_t flagBits;
+    struct {
+      enum ncclWorkP2PType p2pType:4;
+      uint16_t nWarps:4;
+      uint16_t warpStart:4;
+      uint16_t ngroups:4;
+    };
+  };
+  uint16_t opCount;
+  // Important not to use any fields with greater than 4-byte alignment since
+  // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
+  // there were 8-byte fields.
+  //void* buff;
+  uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
+  //size_t count;
+  uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
  int chunkSize;
-  uint8_t ngroups:4;
-  uint8_t warpStart:4;
-  uint8_t nWarps:4;
-  enum ncclWorkElemSubType subType:4;
-  uint16_t opCount:12;
-  uint16_t connIndex:4;
 };
-static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size");
+
+static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) == 8, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 8");
+#define NCCL_MAX_WORK_ELEMENTS_P2P 2

 struct ncclWorkElemReg {
  struct ncclWorkElem elem;
@@ -251,56 +265,31 @@ struct ncclWorkElemReg {
  void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
  void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
 };
-static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size");
-static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size");

-#define NCCL_MAX_WORK_ELEMENTS 1
-#define NCCL_MAX_WORK_ELEMENTS_P2P 2
-#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg))
+#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
+static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 1");
+
 // Number of named barriers supported by CUDA
 #define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)

 struct ncclWork {
+  struct ncclWorkHeader header;
  union {
-    char pad[NCCL_WORK_SIZE];
-    struct ncclWorkElemHeader header;
+    char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
    struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
    struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
    struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
  };
 };
+static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
+static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");

-static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned");
-
-struct ncclChannel {
-  union {
-    struct {
-      struct ncclRing ring;
-      struct ncclTree tree;
-      struct ncclDirect collTree;
-
-      int id;
-
-      // Communication structures
-      struct ncclPeer* peers;
-      struct ncclPeer* devPeers;
-
-      // Operation list for aggregation
-      struct ncclWork* workFifo;
-      int workCount;
-      size_t totalSize;
-      uint64_t workFifoTail; // Only used by CPU
-      uint16_t index;        // Only used by GPU
-
-      // GDRCOPY support
-      struct ncclWork* workFifoGdr;
-      struct ncclWork* workFifoDev;
-      void* gdrMemDesc;
-    };
-    int data[0x80];
-  };
+struct ncclDevChannelPeer {
+  // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
+  // instead of the full ncclConnector.
+  struct ncclConnInfo send[NCCL_MAX_CONNS];
+  struct ncclConnInfo recv[NCCL_MAX_CONNS];
 };
-static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
 #pragma pack(pop)   /* restore original alignment from stack */

 #ifdef ENABLE_PROFILING
@@ -361,38 +350,48 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
 #define COLLTRACE_NUM_ITEMS 8192
 #endif

+struct alignas(16) ncclDevChannel {
+  struct ncclDevChannelPeer *peers;
+  struct ncclRing ring;
+  struct ncclTree tree;
+  struct ncclDirect collTree;
+  uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
+};
+
 struct ncclDevComm {
  int rank;
  int nRanks;
  int buffSizes[NCCL_NUM_PROTOCOLS];

+  // Operation list for aggregation
+  int workFifoDepth;
+  struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
+
  // Flag to ask NCCL kernels to abort
-  volatile uint32_t *abortFlag;
+  volatile uint32_t* abortFlag;

  // Channels, device side
-  struct ncclChannel* channels;
+  struct ncclDevChannel* channels/*[MAXCHANNELS]*/;

 #if defined(ENABLE_NPKIT)
  NpKitEventCollectContext* npKitEventCollectContexts;
  uint64_t* cpuTimestamp;
 #endif

-#ifdef ENABLE_PROFILING
-  // Profiling counters
-  struct ncclProf* devProf;
-#endif
-
 #ifdef ENABLE_COLLTRACE
  struct ncclCollTrace* collTrace;
-  uint32_t collTraceHead, *collTraceTail;
+  volatile uint32_t *collTraceTail;
  pthread_t collTraceThread;
-  bool collTraceExit;
+#endif
+
+#ifdef ENABLE_PROFILING
+  struct ncclProf* devProf;
 #endif
 };

-struct ncclDevCommAndChannels {
-  ncclDevComm comm;
-  ncclChannel channels[MAXCHANNELS];
+struct alignas(16) ncclDevCommAndChannels {
+  struct ncclDevComm comm;
+  struct ncclDevChannel channels[MAXCHANNELS];
 };

 #endif
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -11,6 +10,7 @@
 #include "comm.h"
 #include "group.h"
 #include "collectives.h"
+#include "utils.h"

 #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
 #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
@@ -19,117 +19,10 @@ size_t ncclKernMaxLocalSize();
 size_t ncclKernLocalSize(int i);
 ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
-ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
-ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
-ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
-ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
-ncclResult_t ncclLaunchKernel(ncclComm_t comm);
-ncclResult_t ncclRecordEvents(struct ncclComm* comm);
-ncclResult_t ncclLaunchReset(ncclComm_t comm);
-ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
-ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
-template<int USING_CUDA_GRAPH>
-void HIPRT_CB ncclEnqueueHostSetup(void* arg);
-ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph);
-ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph);
+ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
+ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
+ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
+ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
+ncclResult_t ncclLaunchFinish(struct ncclComm* comm);

-struct ncclBuffRegInfo {
-  void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS];
-  void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS];
-  void* sendbuffs[NCCL_MAX_LOCAL_RANKS];
-  void* recvbuffs[NCCL_MAX_LOCAL_RANKS];
-  int nBuffs;
-};
-
-// Enqueue information (for kernel and proxy) for each operation
-struct ncclQueueElem {
-  struct ncclWork work;
-  struct ncclProxyOp proxyOp;
-  struct ncclBuffRegInfo buffRegInfo;
-};
-
-typedef ncclRecyclableList<struct ncclQueueElem> ncclQueueElemList;
-
-// Structure passed to CUDA graph
-struct ncclQueueInfo {
-  ncclComm_t comm;
-  int maxChannels;    // Dynamic version of gridDim
-  ncclResult_t ret;   // Return value of host setup call
-  int nRegBuffs;
-  ncclQueueElemList* elemList;
-};
-
-static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) {
-  NCCLCHECK(ncclCalloc(eqInfo, 1));
-  (*eqInfo)->comm = comm;
-  (*eqInfo)->elemList = new ncclQueueElemList();
-  (*eqInfo)->comm->nQueueInfoCreated++;
-  return ncclSuccess;
-}
-
-// Reset element queue
-static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
-  if (eqInfo == NULL) return ncclInternalError;
-  eqInfo->maxChannels = 0;
-  eqInfo->ret = ncclSuccess;
-  eqInfo->nRegBuffs = 0;
-  eqInfo->elemList->recycle();
-  return ncclSuccess;
-}
-
-// Destroy enqueue info space
-// used by both CUDA graph and non CUDA graph
-static void ncclDestroyQueueInfo(void* ptr) {
-  if (ptr == NULL) return;
-  struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
-  struct ncclComm* comm = eqInfo->comm;
-  // Close IPC mem handles for registered buffers
-  struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
-#if 0
-  // Ideally, the deregistration should happen here
-  // but currently the destroy function of CUDA objects does not allow CUDA API calls
-  while (eqElem != NULL) {
-    for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
-      if (i == eqInfo->comm->localRank) continue;
-      CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i]));
-      CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i]));
-    }
-    eqElem = eqInfo->elemList->getNext();
-  }
-#else
-  // Instead, we push these pointers to a pool owned by ncclComm
-  // and asks a helper thread to close mem handles
-  struct ncclGraphHelperResources* res = comm->graphHelperResources;
-  int ipcTailOld = 0;
-  if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip;
-
-  pthread_mutex_lock(&res->threadLock);
-  ipcTailOld = res->ipcTail;
-  while (eqElem != NULL) {
-    for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
-      if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) {
-        res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i];
-        res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
-      }
-      if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) {
-        res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i];
-        res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
-      }
-    }
-    eqElem = eqInfo->elemList->getNext();
-  }
-  if (res->ipcTail != ipcTailOld) {
-    res->threadState = ThreadStart;
-    TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld);
-    pthread_cond_signal(&res->threadCond);
-  }
-  pthread_mutex_unlock(&res->threadLock);
-#endif
-
-skip:
-  delete eqInfo->elemList;
-  free(eqInfo);
-  comm->nQueueInfoDestroyed++;
-  return;
-}
 #endif // End include guard
@@ -263,7 +263,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
  NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
  NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
-  CUDACHECK(hipFree(md->gdrDevMem));
+  CUDACHECK(cudaFree(md->gdrDevMem));
  free(md);

  return ncclSuccess;
@@ -24,7 +24,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
 ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);

-ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm);
 void ncclTopoFree(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
@@ -37,7 +37,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int ne
 #define MAX_XGMI_INTER_GPUS 4
 ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
 ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
-int ncclPxnDisable();
+int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
 ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);

@@ -11,15 +11,78 @@
 #include "nccl.h"
 #include "comm.h"

-bool ncclAsyncMode();
-ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
+ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
+void ncclGroupCommJoin(struct ncclComm* comm);
+void ncclGroupCommPreconnect(struct ncclComm* comm);
+void ncclGroupCommLeave(struct ncclComm* comm);

-typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId);
+struct ncclAsyncJob {
+  struct ncclAsyncJob* next;
+  pthread_t thread;
+  ncclResult_t result;
+  ncclResult_t(*func)(struct ncclAsyncJob*);
+  void(*undo)(struct ncclAsyncJob*);
+  void(*destructor)(void*);
+};

-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId);
+ncclResult_t ncclAsyncLaunch(
+  struct ncclAsyncJob* job,
+  ncclResult_t(*func)(struct ncclAsyncJob*),
+  void(*undo)(struct ncclAsyncJob*),
+  void(*destructor)(void*)
+);

-typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclGroupStartInternal();
+ncclResult_t ncclGroupEndInternal();
+
+////////////////////////////////////////////////////////////////////////////////
+
+extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
+extern __thread ncclResult_t ncclGroupError;
+extern __thread struct ncclComm* ncclGroupCommHead;
+extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
+
+inline ncclResult_t ncclGroupStartInternal() {
+  ncclGroupDepth += 1; // one more level of ncclGroupStart nesting on this thread
+  return ncclSuccess;
+}
+
+inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
+  // In a group: a failure always overwrites; a success only overwrites a success.
+  const bool keepStored = (ret == ncclSuccess) && (ncclGroupError != ncclSuccess);
+  if (ncclGroupDepth > 0 && !keepStored) ncclGroupError = ret;
+  return ret;
+}
+
+// Add comm to this thread's group (only if groupNext still holds the 0x1 sentinel)
+inline void ncclGroupCommJoin(struct ncclComm* comm) {
+  struct ncclComm* const sentinel = reinterpret_cast<struct ncclComm*>(0x1);
+  if (comm->groupNext != sentinel) return;
+  // Walk to the first listed comm that shares our intraComm0 (or to the tail)
+  // and splice comm in just ahead of it. This preserves the user's program
+  // order yet ensures siblings occur consecutively, which doLaunches() in
+  // "group.cc" requires.
+  struct ncclComm** link = &ncclGroupCommHead;
+  while (*link != nullptr && (*link)->intraComm0 != comm->intraComm0) link = &(*link)->groupNext;
+  comm->groupNext = *link;
+  *link = comm;
+  // A comm gets a new memory stack scope upon joining. Each task batched for
+  // this comm is allocated there.
+  ncclMemoryStackPush(&comm->memScoped);
+}
+
+// Push comm onto this thread's preconnect list when preconnectNext holds the 0x1 sentinel
+inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
+  struct ncclComm* const sentinel = reinterpret_cast<struct ncclComm*>(0x1);
+  if (comm->preconnectNext != sentinel) return; // any other value: leave comm untouched
+  comm->preconnectNext = ncclGroupCommPreconnectHead;
+  ncclGroupCommPreconnectHead = comm;
+}
+
+// Comm has left group: reset groupNext to the 0x1 sentinel and pop its memory stack scope
+inline void ncclGroupCommLeave(struct ncclComm* comm) {
+  comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
+  ncclMemoryStackPop(&comm->memScoped);
+}

-ncclResult_t ncclAsyncColl(ncclComm_t comm);
 #endif
@@ -1067,6 +1067,9 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
 ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
 struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
 ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
+/* DMA-BUF support */
+ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
+struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
 ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
 ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
 ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
@@ -11,6 +11,9 @@
 #include "nccl.h"
 #include "devcomm.h"
 #include "collectives.h"
+#include "core.h"
+#include "utils.h"
+#include "strongstream.h"

 typedef enum : uint8_t {
  ncclPatternRing,
@@ -53,8 +56,66 @@ struct ncclInfo {
  int nchunksPerLoop;
  int chunkSize;
  int channelId;
-  uint16_t connIndex;
-  uint64_t opCount;
+};
+
+inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
+  // coll is never modified here, so both classifications can be computed up front.
+  const bool asBytes = info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAllPivot;
+  const bool perRank = info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter; // count is per rank
+  info->nBytes = info->count * ncclTypeSize(info->datatype);
+  if (asBytes) { info->count = info->nBytes; info->datatype = ncclInt8; } // re-express as a byte count
+  if (perRank) info->nBytes *= nRanks;
+  return ncclSuccess;
+}
+
+struct ncclTaskColl {
+  struct ncclTaskColl* next;
+  ncclFunc_t func;
+  void const* sendbuff;
+  void* recvbuff;
+  size_t count;
+  int root;
+  ncclDataType_t datatype;
+  ncclDevRedOpFull op;
+  int chunkSteps, sliceSteps;
+};
+struct ncclTaskP2p {
+  ncclTaskP2p *next;
+  void *buff;
+  size_t bytes;
+  // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
+  // of where it left off.
+  int chunk;
+};
+
+struct ncclCudaStreamList {
+  struct ncclCudaStreamList *next;
+  hipStream_t stream;
+};
+
+struct ncclTasks {
+  struct Peer {
+    bool sendSeen, recvSeen;
+    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
+    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
+  };
+  struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
+  size_t collBytesTotal;
+  struct Peer* peers/*[nRanks]*/;
+  int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
+  int nTasksColl, nTasksP2p;
+
+  // The list of user streams aggregated over all tasks present.
+  struct ncclCudaStreamList* streams;
+  // Keep track of the number of user streams
+  int numStreams;
+  // The most recent user stream. Ignored if streams==nullptr
+  hipStream_t streamRecent;
+  // The graph capturing all user streams or invalid if none. Thus we restrict the
+  // user that all streams must be captured in the same graph or not captured
+  // at all. Technically we could probably relax this, but that would mean
+  // collecting a different `ncclTasks` per graph and one for non-graph.
+  struct ncclCudaGraph capturingGraph;
 };

 #endif
@@ -14,12 +14,13 @@

 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4

 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 8

 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

@@ -28,15 +29,15 @@ typedef struct {
  char* pciPath;  // Path to the PCI device in /sys.
  uint64_t guid;  // Unique identifier for the NIC chip. Important for
                  // cards with multiple PCI functions (Physical or virtual).
-  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;      // Port speed in Mbps.
  int port;       // Port number.
  float latency;  // Network latency
  int maxComms;   // Maximum number of comms we can create
  int maxRecvs;   // Maximum number of grouped receives.
-}ncclNetProperties_v5_t;
+}ncclNetProperties_v6_t;

-typedef ncclNetProperties_v5_t ncclNetProperties_t;
+typedef ncclNetProperties_v6_t ncclNetProperties_t;

 typedef struct {
  // Name of the network (mainly for logs)
@@ -46,7 +47,103 @@ typedef struct {
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v6_t;
+
+typedef ncclNet_v6_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v6_t;
+
+typedef ncclCollNet_v6_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
+
+// v5 struct for backwards compatibility
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
@@ -83,10 +180,7 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v5_t;

-typedef ncclNet_v5_t ncclNet_t;
-
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5
-
+// v5 struct for backwards compatibility
 typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
@@ -96,7 +190,7 @@ typedef struct {
  // If ndev returns 0, all other functions might be set to NULL.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create connections.
@@ -125,10 +219,7 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v5_t;

-typedef ncclCollNet_v5_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5
-
+// v4 struct for backwards compatibility
 typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
@@ -140,6 +231,7 @@ typedef struct {
  int maxComms;   // Maximum number of comms we can create
 } ncclNetProperties_v4_t;

+// v4 struct for backwards compatibility
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@@ -179,6 +271,7 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v4_t;

+// v4 struct for backwards compatibility
 typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
@@ -9,33 +9,36 @@

 #include "nccl.h"
 #include "nccl_net.h"
+#include "comm.h"
 #include "checks.h"

-extern ncclNet_t* ncclNet;
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];

-ncclResult_t ncclNetInit();
-int ncclNetVersion();
+ncclResult_t ncclNetPluginInit();
+ncclResult_t ncclNetInit(struct ncclComm* comm);
+int ncclNetVersion(struct ncclComm* comm);

 // Translation to external API
-static const char* ncclNetName() { return ncclNet->name; }
-static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
-static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; }
-static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
-static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
+static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
+static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
+static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
+static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
+/* DMA-BUF support */
+static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
+static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }

 // Test whether the current GPU support GPU Direct RDMA.
-ncclResult_t ncclGpuGdrSupport(int* gdrSupport);
+ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);

 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;
@@ -8,7 +8,7 @@

 #include "nvToolsExt.h"

-#include "cuda.h"
+#include "hip/hip_runtime.h"

 #ifndef NVTOOLSEXT_CUDA_V3
 #define NVTOOLSEXT_CUDA_V3
@@ -42,10 +42,10 @@ extern "C" {
 */
 typedef enum nvtxResourceCUDAType_t
 {
-    NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
-    NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
-    NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
-    NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
+    NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* hipDevice_t */
+    NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* hipCtx_t */
+    NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* hipStream_t */
+    NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* hipEvent_t */
 } nvtxResourceCUDAType_t;


@@ -59,8 +59,8 @@ typedef enum nvtxResourceCUDAType_t
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -73,16 +73,16 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* na
 *
 * \par Example:
 * \code
- * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
- * if ( CUDA_SUCCESS != status )
+ * hipError_t status = hipCtxCreate( &cuContext, 0, cuDevice );
+ * if ( hipSuccess != status )
 *     goto Error;
 * nvtxNameCuContext(cuContext, "CTX_NAME");
 * \endcode
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -95,8 +95,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t*
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -109,8 +109,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* na
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name);
 /** @} */

 /** @} */ /* END RESOURCE_NAMING */
@@ -8,8 +8,8 @@

 #include "nvToolsExt.h"

-#include "cuda.h"
-#include "driver_types.h"
+#include "hip/hip_runtime.h"
+#include "hip/driver_types.h"

 #ifndef NVTOOLSEXT_CUDART_V3
 #define NVTOOLSEXT_CUDART_V3
@@ -44,8 +44,8 @@ extern "C" {
 typedef enum nvtxResourceCUDARTType_t
 {
    NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
-    NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
-    NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
+    NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* hipStream_t */
+    NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* hipEvent_t */
 } nvtxResourceCUDARTType_t;


@@ -73,8 +73,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -87,8 +87,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name);
 /** @} */

 /** @} */ /* END RESOURCE_NAMING */
@@ -16,10 +16,10 @@ extern "C" {

 typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
 typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
-typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
-typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(hipStream_t stream, const char* name);
+typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(hipEvent_t event, const char* name);
+typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);

 NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
 {
@@ -39,7 +39,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
@@ -48,7 +48,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char*
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
@@ -57,7 +57,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
@@ -66,7 +66,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* na
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
@@ -15,16 +15,16 @@
 extern "C" {
 #endif /* __cplusplus */

-typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
-typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
-typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
-typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
-typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(hipDevice_t device, const char* name);
+typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(hipDevice_t device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(hipCtx_t context, const char* name);
+typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(hipCtx_t context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(hipStream_t stream, const char* name);
+typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(hipEvent_t event, const char* name);
+typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);

-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
@@ -33,7 +33,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
@@ -42,7 +42,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* na
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
@@ -51,7 +51,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* na
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
@@ -60,7 +60,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t*
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
@@ -69,7 +69,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
@@ -78,7 +78,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* na
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
@@ -87,7 +87,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
@@ -18,7 +18,7 @@
 /* ------ Dependency-free types binary-compatible with real types ------- */

 /* In order to avoid having the NVTX core API headers depend on non-NVTX
-*  headers like cuda.h, NVTX defines binary-compatible types to use for
+*  headers like hip/hip_runtime.h, NVTX defines binary-compatible types to use for
 *  safely making the initialization versions of all NVTX functions without
 *  needing to have definitions for the real types. */

@@ -9,21 +9,4 @@
 #ifndef NCCL_P2P_H_
 #define NCCL_P2P_H_

-struct ncclP2Pinfo {
-  void* buff;
-  ssize_t nbytes;
-  uint64_t opCount;
-};
-
-typedef ncclRecyclableList<struct ncclP2Pinfo> ncclP2Plist;
-
-static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes, uint64_t opCount) {
-  if (p2p == NULL) p2p = new ncclP2Plist();
-  struct ncclP2Pinfo* next;
-  NCCLCHECK(p2p->getNewElem(&next));
-  next->buff = buff;
-  next->nbytes = nBytes;
-  next->opCount = opCount;
-  return ncclSuccess;
-}
 #endif
@@ -26,18 +26,26 @@ struct ncclProxyOp {
  int channelId;
  int nsteps;
  ssize_t nbytes;
-  int root;
+  struct {
+    int root:30;
+    uint32_t connIndex:2;
+  };
  int next;

  uint64_t opCount;
  int sliceSteps;
  int chunkSteps;
  int chunkSize;
-  ncclDataType_t dtype;
-  ncclRedOp_t redOp;
-  ncclPattern_t pattern; // uint8_t
+  uint8_t /*ncclDataType_t*/ dtype;
+  uint8_t /*ncclDevRedOp_t*/ redOp;
+  uint8_t /*ncclPattern_t*/ pattern;
  uint8_t protocol;
-  uint16_t connIndex;
+
+  union {
+    uint64_t unused;
+    // For use by enqueue.cc
+    struct ncclProxyOp *enqNext;
+  };
 };
 static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");

@@ -73,9 +81,9 @@ struct ncclProxyArgs {
  int sliceSteps;
  int chunkSteps;
  int chunkSize;
-  ncclDataType_t dtype;
-  ncclRedOp_t redOp;
-  ncclPattern_t pattern;
+  uint8_t /*ncclDataType_t*/ dtype;
+  uint8_t /*ncclDevRedOp_t*/ redOp;
+  uint8_t /*ncclPattern_t*/ pattern;
  uint8_t protocol;
  int state;
  char* sharedBuff[NCCL_STEPS];
@@ -164,6 +172,7 @@ struct ncclProxyState {
  pthread_t thread;
  struct ncclSocket* listenSock;
  int stop;
+  hipCtx_t cudaCtx;

  // Used by main thread
  union ncclSocketAddress* peerAddresses;
@@ -193,9 +202,8 @@ enum proxyMode {
  proxyTo = 2
 };

-ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks);
+ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
 ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
@@ -0,0 +1,73 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ROCMWRAP_H_
+#define NCCL_ROCMWRAP_H_
+
+#include <hsa/hsa.h>
+
+typedef hsa_status_t (*PFN_hsa_init)();
+typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
+typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
+typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
+
+
+#define CUPFN(symbol) pfn_##symbol
+
+// Check ROCr (HSA) PFN driver calls
+#define CUCHECK(cmd) do {				      \
+    hsa_status_t err = pfn_##cmd;				      \
+    if( err != HSA_STATUS_SUCCESS ) {				      \
+      const char *errStr;				      \
+      pfn_hsa_status_string(err, &errStr);	      \
+      WARN("ROCr failure '%s'", errStr);		      \
+      return ncclUnhandledCudaError;			      \
+    }							      \
+} while(false)
+
+#define CUCHECKGOTO(cmd, res, label) do {		      \
+    hsa_status_t err = pfn_##cmd;				      \
+    if( err != HSA_STATUS_SUCCESS ) {				      \
+      const char *errStr;				      \
+      pfn_hsa_status_string(err, &errStr);	      \
+      WARN("ROCr failure '%s'", errStr);		      \
+      res = ncclUnhandledCudaError;			      \
+      goto label;					      \
+    }							      \
+} while(false)
+
+// Report failure but clear error and continue
+#define CUCHECKIGNORE(cmd) do {						\
+    hsa_status_t err = pfn_##cmd;						\
+    if( err != HSA_STATUS_SUCCESS ) {						\
+      const char *errStr;						\
+      pfn_hsa_status_string(err, &errStr);			\
+      INFO(NCCL_ALL,"%s:%d ROCr failure '%s'", __FILE__, __LINE__, errStr);	\
+    }									\
+} while(false)
+
+#define CUCHECKTHREAD(cmd, args) do {					\
+    hsa_status_t err = pfn_##cmd;						\
+    if (err != HSA_STATUS_SUCCESS) {						\
+      INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
+      args->ret = ncclUnhandledCudaError;				\
+      return args;							\
+    }									\
+} while(0)
+
+#define DECLARE_ROCM_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
+
+DECLARE_ROCM_PFN_EXTERN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
+
+/* ROCr Driver functions loaded with dlsym() */
+DECLARE_ROCM_PFN_EXTERN(hsa_init);
+DECLARE_ROCM_PFN_EXTERN(hsa_system_get_info);
+DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
+
+ncclResult_t rocmLibraryInit(void);
+
+#endif
@@ -0,0 +1,142 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_STRONGSTREAM_H_
+#define NCCL_STRONGSTREAM_H_
+
+#include "nccl.h"
+#include "checks.h"
+
+#include <stdint.h>
+
+/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
+ * easily.
+ */
+struct ncclCudaGraph {
+#if CUDART_VERSION >= 11030
+  cudaGraph_t graph;
+  uint64_t graphId;
+#endif
+};
+
+inline struct ncclCudaGraph ncclCudaGraphNull() {
+  struct ncclCudaGraph tmp;
+  #if CUDART_VERSION >= 11030
+    tmp.graph = nullptr;
+    tmp.graphId = ULLONG_MAX;
+  #endif
+  return tmp;
+}
+
+inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
+  #if CUDART_VERSION >= 11030
+    return graph.graph != nullptr;
+  #else
+    return false;
+  #endif
+}
+
+inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) {
+  #if CUDART_VERSION >= 11030
+    return a.graphId == b.graphId;
+  #else
+    return true;
+  #endif
+}
+
+ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, hipStream_t stream);
+ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, hipHostFn_t fn, void* arg);
+
+
+/* ncclStrongStream: An abstraction over CUDA streams that do not lose their
+ * identity while being captured. Regular streams have the deficiency that the
+ * captured form of a stream in one graph launch has no relation to the
+ * uncaptured stream or to the captured form in other graph launches. This makes
+ * streams unfit for the use of serializing access to a persistent resource.
+ * Strong streams have been introduced to address this need.
+ *
+ * Constraints of using strong streams:
+ *
+ * - Operations that enqueue work to the strong stream need to be enclosed by
+ *   ncclStrongStream[Acquire/Release] pairs. Acquire/release act like fences;
+ *   the strong stream is not stateful, so there is no harm in redundant
+ *   acquires or releases.
+ *
+ * - An {Acquire; ...; Release} sequence must not be concurrent with any
+ *   other operations against the strong stream including graph launches which
+ *   reference this stream.
+ *
+ * - All strong stream functions take a "graph" parameter which must reference
+ *   the currently capturing graph, or null if none.
+ */
+struct ncclStrongStream;
+
+ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
+ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);
+
+// Has this strong stream ever been captured in a graph.
+bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss);
+
+// Acquire-fence the strong stream.
+ncclResult_t ncclStrongStreamAcquire(
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss
+);
+
+// Acquire-fence the strong stream assuming no graph is capturing. This permits
+// the caller to enqueue directly to the `ss->stream` member using native CUDA
+// calls. Strong stream must be released via:
+//   ncclStrongStreamRelease(ncclCudaGraphNull(), ss);
+ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
+
+// Release-fence of the strong stream.
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
+
+// Add a host launch to the stream.
+ncclResult_t ncclStrongStreamLaunchHost(
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
+  hipHostFn_t fn, void* arg
+);
+// Add a kernel launch to the stream.
+ncclResult_t ncclStrongStreamLaunchKernel(
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
+  void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
+);
+// Cause `a` to wait for the current state of `b`. Both `a` and `b` must be acquired.
+ncclResult_t ncclStrongStreamWaitStream(
+  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
+);
+// `b` must be capturing within `graph`.
+ncclResult_t ncclStrongStreamWaitStream(
+  struct ncclCudaGraph graph, struct ncclStrongStream* a, hipStream_t b
+);
+// `a` must be capturing within `graph`.
+ncclResult_t ncclStrongStreamWaitStream(
+  struct ncclCudaGraph graph, hipStream_t a, struct ncclStrongStream* b
+);
+
+// Synchronization does not need the strong stream to be acquired.
+ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct ncclStrongStream {
+  hipStream_t stream;
+  hipEvent_t event;
+  #if CUDART_VERSION >= 11030
+  cudaGraphNode_t node; // null if never captured, otherwise never null again
+  uint64_t graphId:63, eventIsLagging:1;
+  #endif
+};
+
+inline bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss) {
+  #if CUDART_VERSION >= 11030
+    return ss->node != nullptr;
+  #else
+    return false;
+  #endif
+}
+
+#endif
@@ -21,7 +21,12 @@

 #include "proxy.h"

-extern struct ncclTransport ncclTransports[];
+extern struct ncclTransport p2pTransport;
+extern struct ncclTransport shmTransport;
+extern struct ncclTransport netTransport;
+extern struct ncclTransport collNetTransport;
+
+extern struct ncclTransport* ncclTransports[];

 // Forward declarations
 struct ncclRing;
@@ -66,7 +71,7 @@ struct ncclTransport {
  struct ncclTransportComm recv;
 };

-ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);

 enum { collNetRecv=0, collNetSend=1 };
@@ -8,8 +8,12 @@
 #define NCCL_UTILS_H_

 #include "nccl.h"
+#include "alloc.h"
 #include "checks.h"
 #include <stdint.h>
+#include <time.h>
+#include <sched.h>
+#include <new>

 int ncclCudaCompCap();

@@ -38,81 +42,446 @@ static long log2i(long n) {
 return l;
 }

-// Recyclable list that avoids frequent malloc/free
+// Current CLOCK_MONOTONIC reading expressed in nanoseconds.
+inline uint64_t clockNano() {
+  struct timespec now;
+  clock_gettime(CLOCK_MONOTONIC, &now);
+  uint64_t const nanosPerSecond = 1000*1000*1000;
+  return uint64_t(now.tv_sec)*nanosPerSecond + now.tv_nsec;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Adds one reference to `*refs` with a relaxed atomic add.
+template<typename Int>
+inline void ncclAtomicRefCountIncrement(Int* refs) {
+  (void)__atomic_add_fetch(refs, 1, __ATOMIC_RELAXED);
+}
+
+// Drops one reference from `*refs` (acquire-release ordering) and returns the
+// count that remains after the decrement.
+template<typename Int>
+inline Int ncclAtomicRefCountDecrement(Int* refs) {
+  Int const remaining = __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
+  return remaining;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
+ * granularity of LIFO is not per object, instead frames containing many objects
+ * are pushed and popped. Therefore deallocation is extremely cheap since it is
+ * done at the frame granularity.
+ *
+ * The initial state of the stack is with one frame, the "nil" frame, which
+ * cannot be popped. Therefore objects allocated in the nil frame cannot be
+ * deallocated sooner than stack destruction.
+ */
+struct ncclMemoryStack;
+
+void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
+void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
+void ncclMemoryStackPush(struct ncclMemoryStack* me);
+void ncclMemoryStackPop(struct ncclMemoryStack* me);
 template<typename T>
-struct ncclListElem {
-  T data;
-  struct ncclListElem* next;
+T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1);
+
+////////////////////////////////////////////////////////////////////////////////
+/* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for
+ * a pool instance to ever hold objects whose types have differing
+ * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
+ * a backing `ncclMemoryStack` passed during Alloc(). If memory
+ * backing any currently held object is deallocated then it is an error to do
+ * anything other than reconstruct it, after which it is a valid empty pool.
+ */
+struct ncclMemoryPool;
+
+// Equivalent to zero-initialization
+void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
+template<typename T>
+T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
+template<typename T>
+void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
+void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);
+
+////////////////////////////////////////////////////////////////////////////////
+/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
+ * field is given via the `next` template argument.
+ *
+ * Example:
+ *   struct Foo {
+ *     struct Foo *next1, *next2; // can be a member of two lists at once
+ *   };
+ *   ncclIntruQueue<Foo, &Foo::next1> list1;
+ *   ncclIntruQueue<Foo, &Foo::next2> list2;
+ */
+template<typename T, T *T::*next>
+struct ncclIntruQueue;
+
+template<typename T, T *T::*next>
+void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me);
+template<typename T, T *T::*next>
+bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me);
+template<typename T, T *T::*next>
+T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
+template<typename T, T *T::*next>
+void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
+template<typename T, T *T::*next>
+T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
+template<typename T, T *T::*next>
+T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
+template<typename T, T *T::*next>
+void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool);
+
+////////////////////////////////////////////////////////////////////////////////
+/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
+ * and "cond" fields are part of the public interface.
+ */
+struct ncclThreadSignal {
+  pthread_mutex_t mutex; // held around waits on `cond`
+  pthread_cond_t cond;   // broadcast to wake a waiter
 };

-template<typename T>
-class ncclRecyclableList {
- private:
-  struct ncclListElem<T>* head;
-  struct ncclListElem<T>* tail;
-  struct ncclListElem<T>* cursor;
-  int n;
+// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
+constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();

- public:
-  ncclRecyclableList() {
-    tail = cursor = head = NULL;
-    n = 0;
-  }
+void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
+void ncclThreadSignalDestruct(struct ncclThreadSignal* me);

-  int count() const { return n; }
+// A convenience instance per-thread.
+extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;

-  // Get a new element from the list and return pointer
-  ncclResult_t getNewElem(T** dataOut) {
-    if (tail != NULL) {
-      *dataOut = &tail->data;
-      memset(*dataOut, 0, sizeof(T));
-    } else {
-      NCCLCHECK(ncclCalloc(&tail, 1));
-      *dataOut = &tail->data;
-      cursor = head = tail;
-    }
-    if (tail->next == NULL) {
-      NCCLCHECK(ncclCalloc(&tail->next, 1));
-    }
-    tail = tail->next;
-    n += 1;
-    return ncclSuccess;
-  }
+////////////////////////////////////////////////////////////////////////////////

-  T* begin() {
-    if (head == NULL || head == tail) return NULL;
-    cursor = head->next;
-    return &head->data;
-  }
+template<typename T, T *T::*next>
+struct ncclIntruQueueMpsc;

-  // Get next element from the list during an iteration
-  T* getNext() {
-    // tail always points to the next element to be enqueued
-    // hence does not contain valid data
-    if (cursor == NULL || cursor == tail) return NULL;
-    T* rv = &cursor->data;
-    cursor = cursor->next;
-    return rv;
-  }
+template<typename T, T *T::*next>
+void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me);
+template<typename T, T *T::*next>
+bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me);
+// Enqueue element. Returns true if queue is not abandoned. Even if queue is
+// abandoned the element is enqueued, so the caller needs to make arrangements
+// for the queue to be tended.
+template<typename T, T *T::*next>
+bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T,next>* me, T* x);
+// Dequeue all elements at a glance. If there aren't any and `waitSome` is
+// true then this call will wait until it can return a non empty list.
+template<typename T, T *T::*next>
+T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T,next>* me, bool waitSome);
+// Dequeue all elements and set queue to abandoned state.
+template<typename T, T *T::*next>
+T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T,next>* me);

-  T* peakNext() {
-    if (cursor == NULL || cursor == tail) return NULL;
-    return &cursor->data;
-  }
+////////////////////////////////////////////////////////////////////////////////

-  // Recycle the list without freeing the space
-  void recycle() {
-    tail = cursor = head;
-    n = 0;
-  }
+// Bump-allocator state. Memory is carved out of hunks; a Frame records the
+// current allocation position so ncclMemoryStackPush() can snapshot it and
+// ncclMemoryStackPop() can restore it.
+struct ncclMemoryStack {
+  struct Hunk {
+    struct Hunk* above; // reverse stack pointer
+    size_t size; // size of this allocation (including this header struct)
+  };
+  struct Unhunk { // proxy header for objects allocated out-of-hunk
+    struct Unhunk* next;
+    void* obj; // released with free() when the owning frame is popped
+  };
+  struct Frame {
+    struct Hunk* hunk; // top of non-empty hunks
+    uintptr_t bumper, end; // points into top hunk
+    struct Unhunk* unhunks; // out-of-hunk objects owned by this frame
+    struct Frame* below; // snapshot restored on pop; nullptr for the nil frame
+  };

-  ~ncclRecyclableList() {
-    while (head != NULL) {
-      struct ncclListElem<T>* temp = head;
-      head = head->next;
-      free(temp);
-    }
-  }
+  // Slow path used by allocate() when the request does not fit below
+  // `topFrame.end`; declared here, defined outside this header section.
+  static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
+  // Fast-path bump allocation from the top frame.
+  static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);
+
+  struct Hunk stub; // zero-sized hunk the nil frame starts on
+  struct Frame topFrame; // live allocation state
 };

+// Puts `me` into its initial state: only the nil frame exists, anchored on the
+// embedded zero-sized stub hunk, with no space available yet.
+inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
+  ncclMemoryStack::Hunk* stub = &me->stub;
+  stub->above = nullptr;
+  stub->size = 0;
+
+  ncclMemoryStack::Frame* frame = &me->topFrame;
+  frame->hunk = stub;
+  frame->bumper = 0;
+  frame->end = 0;
+  frame->unhunks = nullptr;
+  frame->below = nullptr;
+}
+
+// Bump-allocates `size` bytes at `align` alignment from the top frame.
+// Requests that do not fit below `topFrame.end` go to allocateSpilled().
+// The returned memory is not zeroed here.
+inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
+  // Round the bump pointer up (assumes `align` is a power of two).
+  uintptr_t const mask = -uintptr_t(align);
+  uintptr_t const start = (me->topFrame.bumper + align-1) & mask;
+  if (__builtin_expect(start + size <= me->topFrame.end, true)) {
+    me->topFrame.bumper = start + size;
+    return reinterpret_cast<void*>(start);
+  }
+  return allocateSpilled(me, size, align);
+}
+
+// Allocates zero-filled storage for `n` objects of type T from the stack.
+template<typename T>
+inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
+  size_t const bytes = n*sizeof(T);
+  void* raw = ncclMemoryStack::allocate(me, bytes, alignof(T));
+  return static_cast<T*>(memset(raw, 0, bytes));
+}
+
+// Opens a new frame. A copy of the current top frame is stored in memory taken
+// from the stack itself and linked through `below`, where
+// ncclMemoryStackPop() finds it.
+inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
+  typedef ncclMemoryStack::Frame Frame;
+  // Copy first: the frame is captured before allocate() advances topFrame.
+  Frame const saved = me->topFrame;
+  void* slot = ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
+  Frame* below = static_cast<Frame*>(slot);
+  *below = saved;
+  me->topFrame.below = below;
+  me->topFrame.unhunks = nullptr;
+}
+
+// Closes the top frame: frees every out-of-hunk object recorded in it, then
+// restores the frame saved by the matching push. Must not be called on the
+// nil frame, whose `below` is null.
+inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
+  for (ncclMemoryStack::Unhunk* u = me->topFrame.unhunks; u != nullptr; u = u->next) {
+    free(u->obj);
+  }
+  me->topFrame = *me->topFrame.below;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Free list of equally sized cells, tracked by its first and last element.
+struct ncclMemoryPool {
+  struct Cell {
+    Cell *next; // link to the next free cell
+  };
+  // Storage shape used when carving a fresh cell: large and aligned enough for
+  // both the free-list link and an object of the given size/alignment.
+  template<int Size, int Align>
+  union CellSized {
+    Cell cell;
+    alignas(Align) char space[Size];
+  };
+  struct Cell* head; // most recently freed cell; nullptr when the pool is empty
+  struct Cell* tail; // meaningful only when head != nullptr
+};
+
+// Makes `me` an empty pool. Only `head` is reset; `tail` is left untouched
+// because it is only meaningful while `head` is non-null.
+inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) {
+  me->head = nullptr;
+}
+
+// Returns zero-filled storage for one T. A cell is reused from the free list
+// when one is available; otherwise a fresh cell is carved from `backing`.
+template<typename T>
+inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
+  typedef ncclMemoryPool::Cell Cell;
+  typedef ncclMemoryPool::CellSized<sizeof(T), alignof(T)> Slot;
+  Cell* taken = me->head;
+  if (__builtin_expect(taken == nullptr, false)) {
+    // Free list is empty. The raw allocate() is used because the memset below
+    // already clears the object.
+    taken = (Cell*)ncclMemoryStack::allocate(backing, sizeof(Slot), alignof(Slot));
+  } else {
+    me->head = taken->next;
+  }
+  memset(taken, 0, sizeof(T));
+  return reinterpret_cast<T*>(taken);
+}
+
+// Hands `obj` back to the pool by pushing it on the front of the free list.
+// The first bytes of `obj` are overwritten with the free-list link.
+template<typename T>
+inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
+  typedef ncclMemoryPool::Cell Cell;
+  Cell* freed = reinterpret_cast<Cell*>(obj);
+  Cell* oldHead = me->head;
+  if (oldHead == nullptr) me->tail = freed; // first cell is also the last
+  freed->next = oldHead;
+  me->head = freed;
+}
+
+// Moves every free cell of `from` to the front of `me`'s free list, leaving
+// `from` empty. Does nothing when `from` holds no cells.
+inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
+  ncclMemoryPool::Cell* first = from->head;
+  if (first == nullptr) return;
+  ncclMemoryPool::Cell* last = from->tail;
+  last->next = me->head;
+  if (me->head == nullptr) me->tail = last;
+  me->head = first;
+  from->head = nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Singly-linked FIFO whose link field inside T is selected by the `next`
+// member pointer. `head` is null when the queue is empty.
+template<typename T, T *T::*next>
+struct ncclIntruQueue {
+  T *head, *tail;
+};
+
+// Resets `me` to the empty queue.
+template<typename T, T *T::*next>
+inline void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me) {
+  me->head = me->tail = nullptr;
+}
+
+// True when the queue holds no elements.
+template<typename T, T *T::*next>
+inline bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me) {
+  return !me->head;
+}
+
+// Returns the first element without removing it (nullptr when empty).
+template<typename T, T *T::*next>
+inline T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me) {
+  return me->head;
+}
+
+// Returns the stored tail pointer without removing anything.
+template<typename T, T *T::*next>
+inline T* ncclIntruQueueTail(ncclIntruQueue<T,next> *me) {
+  return me->tail;
+}
+
+// Appends `x` at the back of the queue; its link field is cleared first.
+template<typename T, T *T::*next>
+inline void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x) {
+  x->*next = nullptr;
+  if (me->head == nullptr) {
+    me->head = x;
+  } else {
+    me->tail->*next = x;
+  }
+  me->tail = x;
+}
+
+// Removes and returns the first element. The queue must not be empty: the
+// head is dereferenced unconditionally.
+template<typename T, T *T::*next>
+inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
+  T *front = me->head;
+  T *rest = front->*next;
+  me->head = rest;
+  if (rest == nullptr) me->tail = nullptr;
+  return front;
+}
+
+// Removes and returns the first element, or returns nullptr when the queue is
+// empty.
+template<typename T, T *T::*next>
+inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
+  T *front = me->head;
+  if (front == nullptr) return nullptr;
+  me->head = front->*next;
+  if (me->head == nullptr) me->tail = nullptr;
+  return front;
+}
+
+// Empties the queue and returns every element it held to `pool`.
+//
+// The successor is read before the current node is freed because
+// ncclMemoryPoolFree() overwrites the start of the node with its free-list
+// link. Each node itself (not its successor) is what gets freed, so the first
+// element is released and nullptr is never passed to the pool.
+template<typename T, T *T::*next>
+void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
+  T *head = me->head;
+  me->head = nullptr;
+  me->tail = nullptr;
+  while (head != nullptr) {
+    T *tmp = head->*next;
+    ncclMemoryPoolFree(pool, head);
+    head = tmp;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Value usable for static initialization of an ncclThreadSignal, built from
+// the pthread static initializer macros.
+constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() {
+  return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER};
+}
+
+// Initializes the mutex and condition variable with default attributes.
+// The pthread return codes are not checked.
+inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
+  pthread_mutex_init(&me->mutex, nullptr);
+  pthread_cond_init(&me->cond, nullptr);
+}
+
+// Destroys the mutex and condition variable. The pthread return codes are not
+// checked.
+inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
+  pthread_mutex_destroy(&me->mutex);
+  pthread_cond_destroy(&me->cond);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Multi-producer queue state. `tail` is either a small flag value or the
+// address of the last enqueued element:
+//   0x0 = empty, 0x1 = empty with the consumer waiting, 0x2 = abandoned,
+//   anything larger = pointer to the last element.
+template<typename T, T *T::*next>
+struct ncclIntruQueueMpsc {
+  T* head; // first element, nullptr until a producer has linked one
+  uintptr_t tail; // flag value or last element, see above
+  struct ncclThreadSignal* waiting; // signal the sleeping consumer waits on
+};
+
+// Resets `me` to the empty, non-abandoned state with no waiter registered.
+template<typename T, T *T::*next>
+void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me) {
+  me->waiting = nullptr;
+  me->tail = uintptr_t(0x0);
+  me->head = nullptr;
+}
+
+// True when `tail` holds one of the flag values (0x0, 0x1, 0x2) rather than a
+// pointer to an element. Uses a relaxed load.
+template<typename T, T *T::*next>
+bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me) {
+  uintptr_t const utail = __atomic_load_n(&me->tail, __ATOMIC_RELAXED);
+  return !(utail > 0x2);
+}
+
+// Appends `x` to the queue. Returns false only when the previous tail value
+// was 0x2 (abandoned); the element is linked in either way.
+template<typename T, T *T::*next>
+bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T,next>* me, T* x) {
+  __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
+  // Claim the tail slot first; the previous value tells us where to link `x`.
+  uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
+  T* prev = reinterpret_cast<T*>(utail);
+  // A flag value (<= 0x2) means there was no element before us, so `x`
+  // becomes the head; otherwise link it after the previous tail element.
+  T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
+  __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
+  if (utail == 0x1) { // waiting
+    __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
+    // This lock/unlock is essential to ensure we don't race ahead of the consumer
+    // and signal the cond before they begin waiting on it.
+    struct ncclThreadSignal* waiting = me->waiting;
+    pthread_mutex_lock(&waiting->mutex);
+    pthread_mutex_unlock(&waiting->mutex);
+    pthread_cond_broadcast(&waiting->cond);
+  }
+  return utail != 0x2; // not abandoned
+}
+
+// Detaches and returns the whole list currently in the queue. With
+// `waitSome` false an empty queue yields nullptr immediately; with `waitSome`
+// true the call spins for about 10us and then sleeps on the calling thread's
+// ncclThreadSignalLocalInstance until a producer has linked a head element.
+template<typename T, T *T::*next>
+T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T,next>* me, bool waitSome) {
+  T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
+  if (head == nullptr) {
+    if (!waitSome) return nullptr;
+    uint64_t t0 = clockNano();
+    bool sleeping = false;
+    do {
+      if (clockNano()-t0 >= 10*1000) { // spin for first 10us
+        struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
+        pthread_mutex_lock(&waitSignal->mutex);
+        // After a previous sleep the tail is already 0x1, so expect that.
+        uintptr_t expected = sleeping ? 0x1 : 0x0;
+        uintptr_t desired = 0x1;
+        me->waiting = waitSignal; // release done by successful compare exchange
+        // Only sleep if the tail is still a flag value; a failed exchange
+        // (including a spurious one) just goes around the loop again.
+        if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
+          sleeping = true;
+          pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
+        }
+        pthread_mutex_unlock(&waitSignal->mutex);
+      }
+      head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
+    } while (head == nullptr);
+  }
+
+  // Detach the list: clear head, then swap the tail back to the empty flag.
+  __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
+  uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
+  T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
+  // Producers publish the tail before linking `next`, so walk the list up to
+  // the captured tail and wait for each link that is not yet visible.
+  T *x = head;
+  while (x != tail) {
+    T *x1;
+    int spins = 0;
+    while (true) {
+      x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
+      if (x1 != nullptr) break;
+      if (++spins == 1024) { spins = 1024-1; sched_yield(); }
+    }
+    x = x1;
+  }
+  return head;
+}
+
+// Marks the queue abandoned (tail = 0x2) and returns whatever list it held,
+// or nullptr when it was empty.
+template<typename T, T *T::*next>
+T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T,next>* me) {
+  uintptr_t expected = 0x0;
+  // This exchange is attempted only once, so it must be a strong CAS: a
+  // spurious failure of a weak CAS on an empty queue would drop into the
+  // branch below and spin until some producer enqueues an element.
+  if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+    return nullptr;
+  } else {
+    // The tail was not 0x0, so wait for the head link to become visible.
+    int spins = 0;
+    T* head;
+    while (true) {
+      head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
+      if (head != nullptr) break;
+      if (++spins == 1024) { spins = 1024-1; sched_yield(); }
+    }
+    // Detach the list and leave the abandoned flag in the tail.
+    __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
+    uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
+    T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
+    // Producers publish the tail before linking `next`; wait for every link
+    // up to the captured tail before handing the list to the caller.
+    T *x = head;
+    while (x != tail) {
+      T *x1;
+      spins = 0;
+      while (true) {
+        x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
+        if (x1 != nullptr) break;
+        if (++spins == 1024) { spins = 1024-1; sched_yield(); }
+      }
+      x = x1;
+    }
+    return head;
+  }
+}
 #endif
@@ -45,12 +45,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
    return ncclInvalidArgument;
  }
  // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P/AllToAllPivot calls to chars.
-  info->nBytes = info->count * ncclTypeSize(info->datatype);
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAllPivot) {
-    info->count = info->nBytes;
-    info->datatype = ncclInt8;
-  }
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+  NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));

  if (info->op < 0 || ncclMaxRedOp < info->op) {
    WARN("%s : invalid reduction operation %d", info->opName, info->op);
@@ -0,0 +1,163 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "debug.h"
+#include "cudawrap.h"
+
+#include <dlfcn.h>
+
+#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
+
+#if CUDART_VERSION >= 11030
+/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
+DECLARE_CUDA_PFN(cuDeviceGet);
+DECLARE_CUDA_PFN(cuDeviceGetAttribute);
+DECLARE_CUDA_PFN(cuGetErrorString);
+DECLARE_CUDA_PFN(cuGetErrorName);
+/* enqueue.cc */
+DECLARE_CUDA_PFN(cuMemGetAddressRange);
+/* proxy.cc */
+DECLARE_CUDA_PFN(cuCtxCreate_v3020);
+DECLARE_CUDA_PFN(cuCtxDestroy);
+DECLARE_CUDA_PFN(cuCtxSetCurrent);
+#if CUDA_VERSION >= 11070
+/* transport/collNet.cc/net.cc*/
+DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
+#endif
+#endif
+
+/* CUDA Driver functions loaded with dlsym() */
+DECLARE_CUDA_PFN(cuInit);
+DECLARE_CUDA_PFN(cuDriverGetVersion);
+DECLARE_CUDA_PFN(cuGetProcAddress);
+
+static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized;
+
+#define CUDA_DRIVER_MIN_VERSION 11030
+
+static void *cudaLib;
+static int cudaDriverVersion;
+
+#if CUDART_VERSION >= 11030
+/*
+  Load the CUDA symbols through pfn_cuGetProcAddress, requesting the entry
+  points for cudaDriverVersion.
+
+  Returns ncclSuccess (0) when every mandatory symbol resolved, otherwise
+  ncclSystemError after logging the symbol that failed.
+ */
+static int cudaPfnFuncLoader(void) {
+  CUresult res;
+
+// LOAD_SYM(symbol, ignore): resolve `symbol` into pfn_<symbol>. When `ignore`
+// is 0 a lookup failure aborts the loader; when it is 1 the failure is
+// skipped silently.
+#define LOAD_SYM(symbol, ignore) do {                                   \
+    res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \
+    if (res != 0) {                                                     \
+      if (!ignore) {                                                    \
+        WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \
+        return ncclSystemError; }                                       \
+    } } while(0)
+
+  // Mandatory symbols.
+  LOAD_SYM(cuGetErrorString, 0);
+  LOAD_SYM(cuGetErrorName, 0);
+  LOAD_SYM(cuDeviceGet, 0);
+  LOAD_SYM(cuDeviceGetAttribute, 0);
+  // Optional symbols: a failed lookup is ignored.
+  LOAD_SYM(cuMemGetAddressRange, 1);
+  LOAD_SYM(cuCtxCreate_v3020, 1);
+  LOAD_SYM(cuCtxDestroy, 1);
+  LOAD_SYM(cuCtxSetCurrent, 1);
+#if CUDA_VERSION >= 11070
+  LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
+#endif
+  return ncclSuccess;
+}
+#endif
+
+/*
+ * Loads libcuda.so once per process and resolves the driver entry points
+ * declared above.
+ *
+ * The result is cached in cudaState: later calls return ncclSuccess or
+ * ncclSystemError without redoing the work, and a thread that loses the
+ * initialization race yields until the winner has finished.
+ *
+ * When NCCL_CUDA_PATH is set it names the directory holding libcuda.so;
+ * otherwise the dynamic loader's default search is used.
+ */
+ncclResult_t cudaLibraryInit(void) {
+  CUresult res;
+
+  if (cudaState == cudaInitialized)
+    return ncclSuccess;
+  if (cudaState == cudaError)
+    return ncclSystemError;
+
+  if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) {
+    // Another thread raced in front of us. Wait for it to be done.
+    while (cudaState == cudaInitializing) sched_yield();
+    return (cudaState == cudaInitialized) ? ncclSuccess : ncclSystemError;
+  }
+
+  /*
+   * Load CUDA driver library
+   */
+  char path[1024];
+  char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
+  if (ncclCudaPath == NULL)
+    snprintf(path, 1024, "%s", "libcuda.so");
+  else
+    // Join with an explicit separator so a directory given without a trailing
+    // slash still resolves; a doubled slash is harmless.
+    snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so");
+
+  cudaLib = dlopen(path, RTLD_LAZY);
+  if (cudaLib == NULL) {
+    // Report the path that was actually tried and never hand NULL to %s.
+    WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
+    goto error;
+  }
+
+  /*
+   * Load initial CUDA functions
+   */
+
+  pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit");
+  if (pfn_cuInit == NULL) {
+    WARN("Failed to load CUDA missing symbol cuInit");
+    goto error;
+  }
+
+  pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion");
+  if (pfn_cuDriverGetVersion == NULL) {
+    WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
+    goto error;
+  }
+
+  res = pfn_cuDriverGetVersion(&cudaDriverVersion);
+  if (res != 0) {
+    WARN("cuDriverGetVersion failed with %d", res);
+    goto error;
+  }
+
+  INFO(NCCL_INIT, "cudaDriverVersion %d", cudaDriverVersion);
+
+  if (cudaDriverVersion < CUDA_DRIVER_MIN_VERSION) {
+    // WARN("CUDA Driver version found is %d. Minimum requirement is %d", cudaDriverVersion, CUDA_DRIVER_MIN_VERSION);
+    // Silently ignore version check mismatch for backwards compatibility
+    goto error;
+  }
+
+  pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress");
+  if (pfn_cuGetProcAddress == NULL) {
+    WARN("Failed to load CUDA missing symbol cuGetProcAddress");
+    goto error;
+  }
+
+  /*
+   * Required to initialize the CUDA Driver.
+   * Multiple calls of cuInit() will return immediately
+   * without making any relevant change
+   */
+  pfn_cuInit(0);
+
+#if CUDART_VERSION >= 11030
+  if (cudaPfnFuncLoader()) {
+    WARN("CUDA some PFN functions not found in the library");
+    goto error;
+  }
+#endif
+
+  cudaState = cudaInitialized;
+  return ncclSuccess;
+
+error:
+  // Remember the failure so later calls fail fast.
+  cudaState = cudaError;
+  return ncclSystemError;
+}
+
+
@@ -57,7 +57,7 @@ ncclResult_t wrap_gdr_symbols(void) {

  if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) {
    // Another thread raced in front of us. Wait for it to be done.
-    while (gdrState == gdrInitializing) pthread_yield();
+    while (gdrState == gdrInitializing) sched_yield();
    return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError;
  }

@@ -30,6 +30,8 @@ struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
 int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
 struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
 struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
+/* DMA-BUF support */
+struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
 int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
 struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
 int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
@@ -49,7 +51,7 @@ ncclResult_t wrap_ibv_symbols(void) {

  if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) {
    // Another thread raced in front of us. Wait for it to be done.
-    while (ibvState == ibvInitializing) pthread_yield();
+    while (ibvState == ibvInitializing) sched_yield();
    return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError;
  }

@@ -98,6 +100,8 @@ ncclResult_t wrap_ibv_symbols(void) {
  LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
  // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
  LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
+  // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
+  LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
  LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
  LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
  LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
@@ -126,6 +130,7 @@ teardown:
  ibv_internal_dealloc_pd = NULL;
  ibv_internal_reg_mr = NULL;
  ibv_internal_reg_mr_iova2 = NULL;
+  ibv_internal_reg_dmabuf_mr = NULL;
  ibv_internal_dereg_mr = NULL;
  ibv_internal_create_cq = NULL;
  ibv_internal_destroy_cq = NULL;
@@ -259,7 +264,7 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or
 }

 ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
-  IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
+  IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
 }

 struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
@@ -275,7 +280,19 @@ ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void
    return ncclInternalError;
  }
  if (ret == NULL) { return ncclSuccess; } // Assume dummy call
-  IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
+  IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
+}
+
+/* DMA-BUF support */
+// Calls the dynamically loaded ibv_reg_dmabuf_mr through IBV_PTR_CHECK_ERRNO,
+// passing `*ret` as the result slot. NOTE(review): the macro is defined
+// outside this section; presumably it stores the MR in *ret and maps a NULL
+// result to an ncclResult_t error - confirm against its definition.
+ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
+  IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
+}
+
+// Forwards straight to the loaded ibv_reg_dmabuf_mr and returns its result.
+// Returns NULL without calling anything when the symbol was not loaded.
+struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
+  return (ibv_internal_reg_dmabuf_mr != NULL)
+    ? ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access)
+    : NULL;
 }

 ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
@@ -0,0 +1,119 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "debug.h"
+#include "rocmwrap.h"
+
+#include <dlfcn.h>
+
+#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
+
+DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
+
+/* ROCr Driver functions loaded with dlsym() */
+DECLARE_ROCM_PFN(hsa_init);
+DECLARE_ROCM_PFN(hsa_system_get_info);
+DECLARE_ROCM_PFN(hsa_status_string);
+
+static enum { hsaUninitialized, hsaInitializing, hsaInitialized, hsaError } hsaState = hsaUninitialized;
+
+static void *hsaLib;
+static uint16_t version_major, version_minor;
+
+/*
+ * Loads libhsa-runtime64.so once per process and resolves the ROCr entry
+ * points declared above.
+ *
+ * The result is cached in hsaState: later calls return ncclSuccess or
+ * ncclSystemError without redoing the work, and a thread that loses the
+ * initialization race yields until the winner has finished.
+ *
+ * When RCCL_ROCR_PATH is set it names the directory holding the library;
+ * otherwise the dynamic loader's default search is used.
+ */
+ncclResult_t rocmLibraryInit(void) {
+  hsa_status_t res;
+
+  if (hsaState == hsaInitialized)
+    return ncclSuccess;
+  if (hsaState == hsaError)
+    return ncclSystemError;
+
+  if (__sync_bool_compare_and_swap(&hsaState, hsaUninitialized, hsaInitializing) == false) {
+    // Another thread raced in front of us. Wait for it to be done.
+    while (hsaState == hsaInitializing) sched_yield();
+    return (hsaState == hsaInitialized) ? ncclSuccess : ncclSystemError;
+  }
+
+  /*
+   * Load ROCr driver library
+   */
+  char path[1024];
+  char *ncclCudaPath = getenv("RCCL_ROCR_PATH");
+  if (ncclCudaPath == NULL)
+    snprintf(path, 1024, "%s", "libhsa-runtime64.so");
+  else
+    // Join with an explicit separator so a directory given without a trailing
+    // slash still resolves; a doubled slash is harmless.
+    snprintf(path, 1024, "%s/%s", ncclCudaPath, "libhsa-runtime64.so");
+
+  hsaLib = dlopen(path, RTLD_LAZY);
+  if (hsaLib == NULL) {
+    // Report the path that was actually tried and never hand NULL to %s.
+    WARN("Failed to find ROCm runtime library %s (RCCL_ROCR_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
+    goto error;
+  }
+
+  /*
+   * Load initial ROCr functions
+   */
+
+  pfn_hsa_init = (PFN_hsa_init) dlsym(hsaLib, "hsa_init");
+  if (pfn_hsa_init == NULL) {
+    WARN("Failed to load ROCr missing symbol hsa_init");
+    goto error;
+  }
+
+  pfn_hsa_system_get_info = (PFN_hsa_system_get_info) dlsym(hsaLib, "hsa_system_get_info");
+  if (pfn_hsa_system_get_info == NULL) {
+    WARN("Failed to load ROCr missing symbol hsa_system_get_info");
+    goto error;
+  }
+
+  pfn_hsa_status_string = (PFN_hsa_status_string) dlsym(hsaLib, "hsa_status_string");
+  if (pfn_hsa_status_string == NULL) {
+    WARN("Failed to load ROCr missing symbol hsa_status_string");
+    goto error;
+  }
+
+  // NOTE(review): the version queries run before pfn_hsa_init() below;
+  // confirm the runtime is already initialized by the time this is called.
+  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
+  if (res != 0) {
+    WARN("pfn_hsa_system_get_info failed with %d", res);
+    goto error;
+  }
+  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
+  if (res != 0) {
+    WARN("pfn_hsa_system_get_info failed with %d", res);
+    goto error;
+  }
+
+  INFO(NCCL_INIT, "ROCr version %d.%d", version_major, version_minor);
+
+  //if (hsaDriverVersion < ROCR_DRIVER_MIN_VERSION) {
+    // WARN("ROCr Driver version found is %d. Minimum requirement is %d", hsaDriverVersion, ROCR_DRIVER_MIN_VERSION);
+    // Silently ignore version check mismatch for backwards compatibility
+    //goto error;
+  //}
+
+  pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf) dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
+  if (pfn_hsa_amd_portable_export_dmabuf == NULL) {
+    WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
+    goto error;
+  }
+  /*
+   * Required to initialize the ROCr Driver.
+   * Multiple calls of hsa_init() will return immediately
+   * without making any relevant change
+   */
+  pfn_hsa_init();
+
+  hsaState = hsaInitialized;
+  return ncclSuccess;
+
+error:
+  // Remember the failure so later calls fail fast.
+  hsaState = hsaError;
+  return ncclSystemError;
+}
+
+
@@ -59,15 +59,15 @@ ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void**

  NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
  if (devShmPtr) {
-    CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, cudaError);
-    CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+    CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, hipError_t);
+    CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
  }

  *shmPtr = ptr;
  return ncclSuccess;
 sysError:
  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
-cudaError:
+hipError_t:
  if (fd != -1) close(fd);
  if (create) shm_unlink(shmPath);
  if (ptr != MAP_FAILED) munmap(ptr, shmSize);
@@ -15,6 +15,9 @@
 #include <vector>
 #include <utility>
 #include <unordered_set>
+#include <unistd.h>
+#include <sys/syscall.h>
+
 static std::vector<std::pair<int, std::unordered_set<std::string>>> clientPortPool;

 /* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
@@ -337,9 +340,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
 #endif
  }

-  /* make all new sockets non-blocking */
-  EQCHECK(flags = fcntl(fd, F_GETFL), -1);
-  SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+  if (sock->asyncFlag) {
+    EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+    SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+  }

  // addr port should be 0 (Any port)
  SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
@@ -378,7 +382,7 @@ static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
      SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
    }

-    if (ret == EINPROGRESS)
+    if (ret == EINPROGRESS || ret == ECONNREFUSED)
      *state = ncclSocketConnecting;
    else if (ret == 0)
      *state = ncclSocketConnected;
@@ -414,10 +418,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {

  const int one = 1;
  SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
-
+  
  /* support non-blocking socket; by default, the socket is non-blocking */
-  EQCHECK(flags = fcntl(fd, F_GETFL), -1);
-  SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+  if (sock->asyncFlag) {
+    EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+    SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+  }

  /*  const int bufsize = 128*1024;
    SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
@@ -458,31 +464,26 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
  int timedout_retries = 0;
  int refused_retries = 0;
 retry:
-  /* async connect; abort when error happens and abortFlag is present. */
+  /* blocking/non-blocking connect() is determined by asyncFlag. */
  ret = connect(fd, &sock->addr.sa, salen);

-  if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
-    (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
-    if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
+  if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+    (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) {
+    if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
    usleep(SLEEP_INT);
    goto retry;
-  } else if (errno == EINPROGRESS && !sock->asyncFlag) {
-    enum ncclSocketState state;
-    do {
-      if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0);
-      NCCLCHECK(getFdState(fd, &state));
-    } while (state == ncclSocketConnecting);
-    EQCHECK(state, ncclSocketError);
-    ret = 0;
  }

-  if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) {
+  /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
+   * However, it can return EISCONN instead of success which indicates connection is built up in
+   * background already. No need to call connect() again. */
+  if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) {
    sock->fd = fd;
    return ncclSuccess;
  }

  WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
-  return ncclSystemError;
+  return ncclRemoteError;
 }

 ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
@@ -535,7 +536,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void*
    if (bytes == -1) {
      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
        WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
-        return ncclSystemError;
+        return ncclRemoteError;
      } else {
        bytes = 0;
      }
@@ -555,7 +556,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
  if (closed) {
    char line[SOCKET_NAME_MAXLEN+1];
    WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
-    return ncclSystemError;
+    return ncclRemoteError;
  }
  return ncclSuccess;
 }
@@ -0,0 +1,273 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "strongstream.h"
+#include "checks.h"
+#include "param.h"
+
+#include <stdlib.h>
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Determine whether `stream` is currently being captured into a graph and
+// report the result through `graph`.
+//
+// The whole body is compiled only when CUDART_VERSION >= 11030; in any other
+// build the function does not touch `graph` and simply returns ncclSuccess.
+//
+// Returns ncclInvalidUsage when the driver reports a version below 11030 while
+// the stream is in any capture state other than "none".
+ncclResult_t ncclCudaGetCapturingGraph(
+    struct ncclCudaGraph* graph, hipStream_t stream
+  ) {
+  #if CUDART_VERSION >= 11030
+    // The driver version is queried once per thread and cached in `driver`.
+    thread_local int driver = -1;
+    if (driver == -1) {
+      CUDACHECK(cudaDriverGetVersion(&driver));
+    }
+    if (driver < 11030) {
+      // Old driver: only the capture status is inspected. `graph->graph` is
+      // cleared; `graph->graphId` is not written on this path.
+      cudaStreamCaptureStatus status;
+      unsigned long long gid;
+      graph->graph = nullptr;
+      CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid));
+      if (status != cudaStreamCaptureStatusNone) {
+        WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+        return ncclInvalidUsage;
+      }
+    } else {
+      // New driver: fetch the graph handle and capture id in one call.
+      cudaStreamCaptureStatus status;
+      unsigned long long gid;
+      CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr));
+      if (status != cudaStreamCaptureStatusActive) {
+        // Not actively capturing: report "no graph" with ULLONG_MAX as the id.
+        graph->graph = nullptr;
+        gid = ULLONG_MAX;
+      }
+      graph->graphId = gid;
+    }
+  #endif
+  return ncclSuccess;
+}
+
+// Attach a destructor callback `fn(arg)` to the graph in `graph`.
+//
+// When CUDART_VERSION >= 11030 a user object wrapping (`arg`, `fn`) is created
+// with an initial reference count of 1 and that reference is moved to the
+// graph. In every other build the function returns ncclInvalidUsage without
+// doing anything.
+ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, hipHostFn_t fn, void* arg) {
+  #if CUDART_VERSION >= 11030
+    cudaUserObject_t object;
+    CUDACHECK(cudaUserObjectCreate(
+      &object, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync
+    ));
+    // Hand over ownership to CUDA Graph
+    CUDACHECK(cudaGraphRetainUserObject(graph.graph, object, 1, cudaGraphUserObjectMove));
+    return ncclSuccess;
+  #else
+    return ncclInvalidUsage;
+  #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Initialise a strong stream: create its non-blocking stream and its
+// timing-disabled event. Both are created in every build configuration.
+//
+// When CUDART_VERSION >= 11030 the graph-tracking fields are reset as well.
+ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) {
+  CUDACHECK(hipStreamCreateWithFlags(&ss->stream, hipStreamNonBlocking));
+  CUDACHECK(hipEventCreateWithFlags(&ss->event, hipEventDisableTiming));
+  #if CUDART_VERSION >= 11030
+    ss->node = nullptr;
+    // All bits set except the top one (2^63-1 when long long is 64 bits wide).
+    // NOTE(review): presumably a sentinel meaning "never captured" - confirm
+    // against ncclStrongStreamEverCaptured().
+    ss->graphId = (1ull<<(8*sizeof(long long)-1))-1;
+    ss->eventIsLagging = 0;
+  #endif
+  return ncclSuccess;
+}
+
+// Release the GPU resources owned by a strong stream.
+//
+// ncclStrongStreamConstruct() creates both `ss->event` and `ss->stream`
+// regardless of CUDART_VERSION, so both are destroyed here unconditionally.
+// Destroying the event only under the CUDART_VERSION guard leaked it in builds
+// where that guard is false.
+//
+// Returns ncclSuccess, or whatever CUDACHECK yields for the first runtime call
+// that fails.
+ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) {
+  CUDACHECK(hipEventDestroy(ss->event));
+  CUDACHECK(hipStreamDestroy(ss->stream));
+  return ncclSuccess;
+}
+
+NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1)
+
+// Prepare strong stream `ss` for work that will be issued either directly
+// (graph.graph == nullptr) or into the graph being captured (`graph`).
+//
+// The body is compiled only when CUDART_VERSION >= 11030; otherwise this is a
+// no-op that returns ncclSuccess. "mixing" is the GRAPH_MIXING_SUPPORT
+// parameter read through ncclParamGraphMixingSupport().
+ncclResult_t ncclStrongStreamAcquire(
+    struct ncclCudaGraph graph, struct ncclStrongStream* ss
+  ) {
+  #if CUDART_VERSION >= 11030
+    bool mixing = ncclParamGraphMixingSupport();
+    if (graph.graph == nullptr) {
+      // Uncaptured use: if this strong stream has ever been captured, order
+      // the real stream after the event and mark the event as up to date.
+      if (mixing && ncclStrongStreamEverCaptured(ss)) {
+        CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
+        ss->eventIsLagging = 0;
+      }
+    } else {
+      // Captured use: only the first acquire for a given graph id does work.
+      if (ss->graphId != graph.graphId) {
+        if (mixing && ss->eventIsLagging) {
+          // Can only be here if previous release was for uncaptured work that
+          // elided updating the event because no capture had yet occurred.
+          CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
+          CUDACHECK(cudaEventRecord(ss->event, ss->stream));
+        }
+        ss->graphId = graph.graphId;
+        ss->eventIsLagging = 0;
+        // Seed ss->node with the first node for this graph: an event-wait
+        // node when mixing is enabled, otherwise an empty node.
+        if (mixing) {
+          CUDACHECK(cudaGraphAddEventWaitNode(&ss->node, graph.graph, nullptr, 0, ss->event));
+        } else {
+          CUDACHECK(cudaGraphAddEmptyNode(&ss->node, graph.graph, nullptr, 0));
+        }
+      }
+    }
+  #endif
+  return ncclSuccess;
+}
+
+// Acquire `ss` for work that is known not to be graph-captured.
+//
+// With CUDART_VERSION >= 11030: when graph mixing is enabled and the strong
+// stream has been captured before, its stream is made to wait on its event;
+// the event is then flagged as lagging because the caller is expected to add
+// work to the stream. In other builds this is a no-op.
+ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) {
+  #if CUDART_VERSION >= 11030
+    const bool mustWait = ncclParamGraphMixingSupport() && ncclStrongStreamEverCaptured(ss);
+    if (mustWait) {
+      CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
+    }
+    // The caller is assumed to enqueue work next, so the event falls behind.
+    ss->eventIsLagging = 1;
+  #endif
+  return ncclSuccess;
+}
+
+// Finish a section of work on `ss` by bringing its event up to date.
+//
+// With CUDART_VERSION >= 11030 and graph mixing enabled, a lagging event is
+// refreshed: as an event-record node appended after ss->node when capturing,
+// or as a plain event record on the stream when not capturing (and only if the
+// strong stream has ever been captured). In other builds this is a no-op.
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) {
+  #if CUDART_VERSION >= 11030
+    const bool mixingEnabled = ncclParamGraphMixingSupport();
+    // Nothing to refresh unless mixing is on and the event is behind.
+    if (!mixingEnabled || !ss->eventIsLagging) return ncclSuccess;
+
+    if (graph.graph != nullptr) {
+      CUDACHECK(cudaGraphAddEventRecordNode(&ss->node, graph.graph, &ss->node, 1, ss->event));
+      ss->eventIsLagging = 0;
+    } else if (ncclStrongStreamEverCaptured(ss)) {
+      CUDACHECK(cudaEventRecord(ss->event, ss->stream));
+      ss->eventIsLagging = 0;
+    }
+  #endif
+  return ncclSuccess;
+}
+
+#if !(CUDART_VERSION >= 11030)
+// Heap record that carries a host function and its argument through
+// hipStreamAddCallback(), whose callback signature
+// (stream, status, userData) differs from hipHostFn_t (userData only).
+struct ncclStrongStreamHostCall {
+  hipHostFn_t fn;
+  void* arg;
+};
+
+// Stream-callback adapter: unpack the record, free it, then invoke the host
+// function with the argument the caller originally supplied.
+static void ncclStrongStreamHostCallTrampoline(hipStream_t stream, hipError_t status, void* userData) {
+  struct ncclStrongStreamHostCall* call = (struct ncclStrongStreamHostCall*)userData;
+  hipHostFn_t fn = call->fn;
+  void* arg = call->arg;
+  free(call);
+  (void)stream;
+  (void)status;
+  fn(arg);
+}
+#endif
+
+// Schedule host function `fn(arg)` behind the work already queued on `ss`.
+//
+// With CUDART_VERSION >= 11030 the function is launched on the stream, or
+// added as a host node after ss->node when `graph` is being captured, and the
+// event is flagged as lagging.
+//
+// In all other builds the function is scheduled with hipStreamAddCallback().
+// The callback type taken by that API has three parameters, so `fn` must not
+// be cast to it directly: it would be invoked with the stream handle in place
+// of `arg`. A small trampoline record is used instead; it is freed by the
+// trampoline, or here if registration fails.
+//
+// Returns ncclSystemError if the trampoline record cannot be allocated, or
+// whatever CUDACHECK yields for a failing runtime call.
+ncclResult_t ncclStrongStreamLaunchHost(
+    struct ncclCudaGraph graph, struct ncclStrongStream* ss, hipHostFn_t fn, void* arg
+  ) {
+  #if CUDART_VERSION >= 11030
+    if (graph.graph == nullptr) {
+      CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg));
+    } else {
+      cudaHostNodeParams p;
+      p.fn = fn;
+      p.userData = arg;
+      CUDACHECK(cudaGraphAddHostNode(&ss->node, graph.graph, &ss->node, 1, &p));
+    }
+    ss->eventIsLagging = 1;
+  #else
+    // hipLaunchHostFunc(ss->stream, fn, arg) would be the direct equivalent
+    // but is not used in this build.
+    struct ncclStrongStreamHostCall* call =
+        (struct ncclStrongStreamHostCall*)malloc(sizeof(struct ncclStrongStreamHostCall));
+    if (call == nullptr) {
+      WARN("Failed to allocate host callback record for strong stream");
+      return ncclSystemError;
+    }
+    call->fn = fn;
+    call->arg = arg;
+    hipError_t addResult = hipStreamAddCallback(ss->stream, ncclStrongStreamHostCallTrampoline, call, 0);
+    if (addResult != hipSuccess) {
+      // The trampoline will never run, so the record must be released here.
+      free(call);
+      CUDACHECK(addResult);
+    }
+  #endif
+  return ncclSuccess;
+}
+
+// Launch kernel `fn` with the given grid/block geometry, argument array and
+// dynamic shared-memory size on strong stream `ss`.
+//
+// With CUDART_VERSION >= 11030 the kernel is either launched on ss->stream
+// (no capture) or added to `graph` as a kernel node that depends on the
+// current ss->node, which then becomes the new ss->node; the event is flagged
+// as lagging in both cases. In other builds the kernel is launched directly
+// with hipLaunchKernel and `graph` is ignored.
+ncclResult_t ncclStrongStreamLaunchKernel(
+    struct ncclCudaGraph graph, struct ncclStrongStream* ss,
+    void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes
+  ) {
+  #if CUDART_VERSION >= 11030
+    if (graph.graph == nullptr) {
+      CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
+    } else {
+      // Copy the current tip first: ss->node is overwritten by the call below.
+      cudaGraphNode_t tip = ss->node;
+      cudaKernelNodeParams p;
+      p.func = fn;
+      p.gridDim = grid;
+      p.blockDim = block;
+      p.kernelParams = args;
+      p.sharedMemBytes = sharedMemBytes;
+      p.extra = nullptr;
+      CUDACHECK(cudaGraphAddKernelNode(&ss->node, graph.graph, &tip, 1, &p));
+    }
+    ss->eventIsLagging = 1;
+  #else
+    CUDACHECK(hipLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
+  #endif
+  return ncclSuccess;
+}
+
+// Make strong stream `a` wait for the work currently on strong stream `b`.
+//
+// With CUDART_VERSION >= 11030:
+//  - not capturing: b's event is recorded first if it is lagging, then a's
+//    stream waits on it and a's event is flagged as lagging;
+//  - capturing: an empty node depending on both a->node and b->node is added
+//    to `graph` and becomes the new a->node.
+// In other builds b's event is always recorded on b's stream and a's stream
+// waits on it; `graph` is ignored.
+ncclResult_t ncclStrongStreamWaitStream(
+    struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
+  ) {
+  #if CUDART_VERSION >= 11030
+    if (graph.graph == nullptr) {
+      if (b->eventIsLagging) {
+        b->eventIsLagging = 0;
+        CUDACHECK(cudaEventRecord(b->event, b->stream));
+      }
+      CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0));
+      a->eventIsLagging = 1;
+    } else {
+      cudaGraphNode_t pair[2] = {a->node, b->node};
+      CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
+    }
+  #else
+    CUDACHECK(hipEventRecord(b->event, b->stream));
+    CUDACHECK(hipStreamWaitEvent(a->stream, b->event, 0));
+  #endif
+  return ncclSuccess;
+}
+
+// Make strong stream `a` wait for the work currently on plain stream `b`.
+//
+// With CUDART_VERSION >= 11030:
+//  - not capturing: a's own event is recorded on `b`, a's stream waits on it,
+//    and a's event is flagged as lagging;
+//  - capturing: the capture dependencies of `b` are merged into a->node.
+//    Returns ncclInvalidUsage if `b` is not actively being captured by the
+//    graph identified by graph.graphId.
+// In other builds a's event is recorded on `b` and a's stream waits on it;
+// `graph` is ignored.
+ncclResult_t ncclStrongStreamWaitStream(
+    struct ncclCudaGraph graph, struct ncclStrongStream* a, hipStream_t b
+  ) {
+  #if CUDART_VERSION >= 11030
+    if (graph.graph == nullptr) {
+      CUDACHECK(cudaEventRecord(a->event, b));
+      CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0));
+      // We used a->event to record b so it no longer reflects anything about a.
+      a->eventIsLagging = 1;
+    } else {
+      cudaStreamCaptureStatus status;
+      unsigned long long gid1;
+      cudaGraphNode_t const* deps;
+      size_t depN = 0;
+      CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &gid1, nullptr, &deps, &depN));
+      if (status != cudaStreamCaptureStatusActive || graph.graphId != gid1) {
+        WARN("Stream is not being captured by the expected graph.");
+        return ncclInvalidUsage;
+      }
+      // Skip when `b` has no dependencies, or its single dependency already is
+      // a->node.
+      if (depN > 0 && (depN > 1 || deps[0] != a->node)) {
+        // Reduce b's dependencies to one node: the dependency itself when
+        // there is exactly one, otherwise a new empty node joining them all.
+        cudaGraphNode_t tie;
+        if (depN == 1) {
+          tie = deps[0];
+        } else {
+          CUDACHECK(cudaGraphAddEmptyNode(&tie, graph.graph, deps, depN));
+        }
+        cudaGraphNode_t pair[2] = {a->node, tie};
+        CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
+      }
+      // a->eventIsLagging doesn't change since we are just updating the
+      // dependencies of a->node.
+    }
+  #else
+    CUDACHECK(hipEventRecord(a->event, b));
+    CUDACHECK(hipStreamWaitEvent(a->stream, a->event, 0));
+  #endif
+  return ncclSuccess;
+}
+
+// Make plain stream `a` wait for the work currently on strong stream `b`.
+//
+// With CUDART_VERSION >= 11030:
+//  - not capturing: b's event is recorded first if it is lagging, then `a`
+//    waits on it;
+//  - capturing: b->node is added to the capture dependencies of `a`.
+// In other builds b's event is always recorded on b's stream and `a` waits on
+// it; `graph` is ignored.
+ncclResult_t ncclStrongStreamWaitStream(
+    struct ncclCudaGraph graph, hipStream_t a, struct ncclStrongStream* b
+  ) {
+  #if CUDART_VERSION >= 11030
+    if (graph.graph == nullptr) {
+      if (b->eventIsLagging) {
+        b->eventIsLagging = 0;
+        CUDACHECK(cudaEventRecord(b->event, b->stream));
+      }
+      CUDACHECK(cudaStreamWaitEvent(a, b->event, 0));
+    } else {
+      CUDACHECK(cudaStreamUpdateCaptureDependencies(a, &b->node, 1, cudaStreamAddCaptureDependencies));
+    }
+  #else
+    CUDACHECK(hipEventRecord(b->event, b->stream));
+    CUDACHECK(hipStreamWaitEvent(a, b->event, 0));
+  #endif
+  return ncclSuccess;
+}
+
+// Block the calling host thread until the work on ss->stream has completed.
+//
+// With CUDART_VERSION >= 11030 the stream is first made to wait on ss->event,
+// so that whatever the event tracks is included in the synchronisation.
+ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) {
+  #if CUDART_VERSION >= 11030
+    CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
+  #endif
+  CUDACHECK(hipStreamSynchronize(ss->stream));
+  return ncclSuccess;
+}
@@ -11,6 +11,8 @@
 #include "nvmlwrap.h"
 #include <hip/hip_runtime.h>

+#include <stdlib.h>
+
 // Get current Compute Capability
 int ncclCudaCompCap() {
  int cudaDev;
@@ -192,3 +194,102 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
  }
  return false;
 }
+
+__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
+
+// Slow-path allocator for ncclMemoryStack, used when an object of `size` bytes
+// aligned to `align` does not fit in the remaining space of the top hunk.
+//
+// Strategy, in order:
+//  1. If the current hunk still has at least 8 KiB free, the object is simply
+//     large: allocate it out-of-band ("unhunked") and track it with an Unhunk
+//     proxy placed in the hunk.
+//  2. If an empty hunk is already chained above the current one and the object
+//     fits in it, move up into that hunk.
+//  3. Otherwise malloc a new hunk (64 KiB larger than the current one), unless
+//     even that would be too small and the Unhunk proxy fits in the current
+//     hunk, in which case go unhunked.
+//
+// Never returns null: if malloc fails the process is aborted.
+//
+// NOTE(review): on the unhunked path the object comes from plain malloc(), so
+// `align` is not applied there - confirm callers never need more than malloc's
+// natural alignment.
+void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {
+  // `me->topFrame.hunk` points to the top of the stack of non-empty hunks.
+  // Hunks above this (reachable via `->above`) are empty.
+  struct Hunk* top = me->topFrame.hunk;
+  size_t mallocSize = 0;
+
+  // If we have lots of space left in hunk but that wasn't enough then we'll
+  // allocate the object unhunked.
+  if (me->topFrame.end - me->topFrame.bumper >= 8<<10)
+    goto unhunked;
+
+  // If we have another hunk (which must be empty) waiting above this one and
+  // the object fits then use that.
+  if (top && top->above) {
+    struct Hunk* top1 = top->above;
+    uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align);
+    if (uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
+      me->topFrame.hunk = top1;
+      me->topFrame.bumper = uobj + size;
+      me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
+      return reinterpret_cast<void*>(uobj);
+    }
+  }
+
+  { // If the next hunk we're going to allocate wouldn't be big enough but the
+    // Unhunk proxy fits in the current hunk then go allocate as unhunked.
+    size_t nextSize = (top ? top->size : 0) + (64<<10);
+    constexpr size_t maxAlign = 64;
+    if (nextSize < sizeof(struct Hunk) + maxAlign + size) {
+      uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
+      if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
+        goto unhunked;
+    }
+
+    // At this point we must need another hunk, either to fit the object
+    // itself or its Unhunk proxy.
+    mallocSize = nextSize;
+    INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
+    struct Hunk *top1 = (struct Hunk*)malloc(mallocSize);
+    if (top1 == nullptr) goto malloc_exhausted;
+    top1->size = nextSize;
+    // Splice the new hunk in directly above `top`, keeping any empty hunks
+    // that were already chained there. Overwriting `top->above` outright
+    // would make them unreachable from the hunk list walked by
+    // ncclMemoryStackDestruct(), i.e. leak them.
+    top1->above = top ? top->above : nullptr;
+    if (top) top->above = top1;
+    top = top1;
+    me->topFrame.hunk = top;
+    me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
+    me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
+  }
+
+  { // Try to fit object in the new top hunk.
+    uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align);
+    if (uobj + size <= me->topFrame.end) {
+      me->topFrame.bumper = uobj + size;
+      return reinterpret_cast<void*>(uobj);
+    }
+  }
+
+unhunked:
+  { // We need to allocate the object out-of-band and put an Unhunk proxy in-band
+    // to keep track of it.
+    uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
+    Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
+    me->topFrame.bumper = uproxy + sizeof(Unhunk);
+    // Push the proxy onto the current frame's list of unhunked objects.
+    proxy->next = me->topFrame.unhunks;
+    me->topFrame.unhunks = proxy;
+    mallocSize = size;
+    proxy->obj = malloc(mallocSize);
+    INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
+    if (proxy->obj == nullptr) goto malloc_exhausted;
+    return proxy->obj;
+  }
+
+malloc_exhausted:
+  WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
+  abort();
+}
+
+// Tear down a memory stack, releasing every heap allocation it owns.
+//
+// Two passes are required and their order matters: the Unhunk proxies (and the
+// frames that list them) live inside the hunks, so the out-of-band objects
+// must be freed before the hunks that describe them.
+void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
+  // Pass 1: walk every frame from the top down and free each out-of-band
+  // object recorded in that frame's Unhunk list.
+  for (struct ncclMemoryStack::Frame* frame = &me->topFrame; frame != nullptr; frame = frame->below) {
+    for (struct ncclMemoryStack::Unhunk* proxy = frame->unhunks; proxy != nullptr; proxy = proxy->next) {
+      free(proxy->obj);
+    }
+  }
+
+  // Pass 2: free the hunk chain that hangs off the embedded stub. The link is
+  // read before the hunk is released.
+  struct ncclMemoryStack::Hunk* hunk = me->stub.above;
+  while (hunk != nullptr) {
+    struct ncclMemoryStack::Hunk* next = hunk->above;
+    free(hunk);
+    hunk = next;
+  }
+}
@@ -41,7 +41,8 @@ typedef enum { ncclSuccess                 =  0,
               ncclInternalError           =  3,
               ncclInvalidArgument         =  4,
               ncclInvalidUsage            =  5,
-               ncclNumResults              =  6 } ncclResult_t;
+               ncclRemoteError             =  6,
+               ncclNumResults              =  7 } ncclResult_t;

 /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 *
@@ -135,11 +136,21 @@ ncclResult_t  ncclCommAbort(ncclComm_t comm);
 ncclResult_t pncclCommAbort(ncclComm_t comm);
 /// @endcond

-/*! @brief Returns a human-readable error message. */
+/*! @brief Returns a string for each error code. */
 const char*  ncclGetErrorString(ncclResult_t result);
+/// @cond include_hidden 
 const char* pncclGetErrorString(ncclResult_t result);
+/// @endcond

-/*! @brief Checks whether the comm has encountered any asynchronous errors */
+/*! @brief Returns a human-readable message of the last error that occurred.
+ * comm is currently unused and can be set to NULL
+ */
+const char*  ncclGetLastError(ncclComm_t comm);
+/// @cond include_hidden 
+const char* pncclGetError(ncclComm_t comm);
+/// @endcond
+
+/*! @brief Checks whether the comm has encountered any asynchronous errors */
 ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
 /// @cond include_hidden 
 ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
@@ -9,15 +9,16 @@
 //#include <sys/stat.h>
 //#include <unistd.h>

-ncclNet_t *ncclNet;
-ncclCollNet_t *ncclCollNet;
-
-static ncclNet_v5_t ncclNet_v4_as_v5;
+static ncclNet_v6_t ncclNet_v4_as_v6;
+static ncclNet_v6_t ncclNet_v5_as_v6;
 static ncclNet_v4_t *ncclNet_v4;
-static ncclCollNet_v5_t ncclCollNet_v4_as_v5;
+static ncclNet_v5_t *ncclNet_v5;
+static ncclCollNet_v6_t ncclCollNet_v4_as_v6;
+static ncclCollNet_v6_t ncclCollNet_v5_as_v6;
 static ncclCollNet_v4_t *ncclCollNet_v4;
+static ncclCollNet_v5_t *ncclCollNet_v5;

-static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
  ncclNetProperties_v4_t p4;
  ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
  if (ans != ncclSuccess) return ans;
@@ -33,17 +34,17 @@ static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
  return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
 }

-static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
  if (n == 0) return ncclSuccess;
  if (n != 1) return ncclInvalidArgument;
  return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
 }

-static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
  if (n == 0) return ncclSuccess;
  if (n != 1) return ncclInvalidArgument;
  return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
@@ -51,27 +52,51 @@ static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data,

 // We use a wrapper around the v4 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v4->init(logfn));
-  ncclNet_v4_as_v5.name = ncclNet_v4->name;
-  ncclNet_v4_as_v5.devices = ncclNet_v4->devices;
-  ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties;
-  ncclNet_v4_as_v5.listen = ncclNet_v4->listen;
-  ncclNet_v4_as_v5.connect = ncclNet_v4->connect;
-  ncclNet_v4_as_v5.accept = ncclNet_v4->accept;
-  ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr;
-  ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr;
-  ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend;
-  ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv;
-  ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush;
-  ncclNet_v4_as_v5.test = ncclNet_v4->test;
-  ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend;
-  ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv;
-  ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen;
+  ncclNet_v4_as_v6.name = ncclNet_v4->name;
+  ncclNet_v4_as_v6.devices = ncclNet_v4->devices;
+  ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties;
+  ncclNet_v4_as_v6.listen = ncclNet_v4->listen;
+  ncclNet_v4_as_v6.connect = ncclNet_v4->connect;
+  ncclNet_v4_as_v6.accept = ncclNet_v4->accept;
+  ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr;
+  ncclNet_v4_as_v6.regMrDmaBuf = NULL;
+  ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr;
+  ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend;
+  ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv;
+  ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush;
+  ncclNet_v4_as_v6.test = ncclNet_v4->test;
+  ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend;
+  ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv;
+  ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+// Init shim that exposes a v5 network plugin through the v6 interface.
+// The plugin's own init runs first; only afterwards are its fields copied
+// into the v6 table, because the plugin may not populate them before init.
+// The v5 interface has no DMA-BUF registration entry, so that slot is NULL.
+static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclNet_v5->init(logfn));
+
+  ncclNet_v5_t* const plugin = ncclNet_v5;
+  ncclNet_v6_t* const shim = &ncclNet_v5_as_v6;
+
+  shim->name = plugin->name;
+  shim->devices = plugin->devices;
+  shim->getProperties = plugin->getProperties;
+  shim->listen = plugin->listen;
+  shim->connect = plugin->connect;
+  shim->accept = plugin->accept;
+  shim->regMr = plugin->regMr;
+  shim->regMrDmaBuf = NULL;
+  shim->deregMr = plugin->deregMr;
+  shim->isend = plugin->isend;
+  shim->irecv = plugin->irecv;
+  shim->iflush = plugin->iflush;
+  shim->test = plugin->test;
+  shim->closeSend = plugin->closeSend;
+  shim->closeRecv = plugin->closeRecv;
+  shim->closeListen = plugin->closeListen;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
  ncclNetProperties_v4_t p4;
  ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
  if (ans != ncclSuccess) return ans;
@@ -89,25 +114,58 @@ static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetPropertie

 // We use a wrapper around the v4 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v4->init(logfn));
-  ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name;
-  ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices;
-  ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties;
-  ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen;
-  ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect;
-  ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport;
-  ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr;
-  ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr;
-  ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce;
-  ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush;
-  ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test;
-  ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl;
-  ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen;
+  ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
+  ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices;
+  ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties;
+  ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen;
+  ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect;
+  ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport;
+  ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr;
+  ncclCollNet_v4_as_v6.regMrDmaBuf = NULL;
+  ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr;
+  ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce;
+  ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush;
+  ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test;
+  ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl;
+  ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen;
  return ncclSuccess;
 }

-static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
+// Init shim that exposes a v5 collective-network plugin through the v6
+// interface. The plugin's own init runs first; only afterwards are its fields
+// copied into the v6 table, because the plugin may not populate them before
+// init. The v5 interface has no DMA-BUF registration entry, so that slot is
+// NULL.
+static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v5->init(logfn));
+
+  ncclCollNet_v5_t* const plugin = ncclCollNet_v5;
+  ncclCollNet_v6_t* const shim = &ncclCollNet_v5_as_v6;
+
+  shim->name = plugin->name;
+  shim->devices = plugin->devices;
+  shim->getProperties = plugin->getProperties;
+  shim->listen = plugin->listen;
+  shim->connect = plugin->connect;
+  shim->reduceSupport = plugin->reduceSupport;
+  shim->regMr = plugin->regMr;
+  shim->regMrDmaBuf = NULL;
+  shim->deregMr = plugin->deregMr;
+  shim->iallreduce = plugin->iallreduce;
+  shim->iflush = plugin->iflush;
+  shim->test = plugin->test;
+  shim->closeColl = plugin->closeColl;
+  shim->closeListen = plugin->closeListen;
+  return ncclSuccess;
+}
+
+// NOTE(review): presumably serialises updates to the tables below - its users
+// are outside this view, confirm.
+static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
+// Table of candidate network transports. Slot 0 starts empty and is filled in
+// by ncclNetPluginInit() when an external plugin is loaded; slots 1 and 2 are
+// the built-in IB and socket transports.
+ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
+// Matching table of collective-network transports; all slots start empty.
+ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr };
+// Per-slot state; every slot starts as ncclNetStateInit.
+enum ncclNetState {
+  ncclNetStateInit = 0,
+  ncclNetStateEnabled = 1,
+  ncclNetStateDisabled = 2
+};
+// One state entry for each slot of ncclNets / ncclCollNets.
+enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+
+ncclResult_t ncclNetPluginInit() {
  char ncclNetPluginName[128];
  const char* envPluginName = getenv("NCCL_NET_PLUGIN");
  if (envPluginName && strlen(envPluginName)) {
@@ -126,67 +184,104 @@ static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
    } else {
      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
    }
-    return;
+    return ncclSuccess;
  }

-  *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
-  if (*net == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol.");
-    ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
-    if (ncclNet_v4 == nullptr) {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol.");
-      if (netPluginLib != nullptr) dlclose(netPluginLib);
-      return;
+  ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+  if (ncclNets[0] == nullptr) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
+    // Try v5 plugin
+    ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+    if (ncclNet_v5 == nullptr) {
+      ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
+      if (ncclNet_v4 == nullptr) {
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5).");
+        if (netPluginLib != nullptr) dlclose(netPluginLib);
+        return ncclSuccess;
+      }
+      ncclNets[0] = &ncclNet_v4_as_v6;
+      ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init;
+      // Set the name right away to allow for NCCL_NET=... to work
+      ncclNet_v4_as_v6.name = ncclNet_v4->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name);
+    } else {
+      ncclNets[0] = &ncclNet_v5_as_v6;
+      ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init;
+      // Set the name right away to allow for NCCL_NET=... to work
+      ncclNet_v5_as_v6.name = ncclNet_v5->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
    }
-    *net = &ncclNet_v4_as_v5;
-    ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init;
  }

  // Check for CollNet
-  *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
-  if (*collnet == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.");
-    ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
-    if (ncclCollNet_v4 == nullptr) {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.");
+  ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+  if (ncclCollNets[0] == nullptr) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
+    ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+    if (ncclCollNet_v5 == nullptr) {
+      ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
+      if (ncclCollNet_v4 == nullptr) {
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).");
+      } else {
+        ncclCollNets[0] = &ncclCollNet_v4_as_v6;
+        ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init;
+        ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name);
+      }
    } else {
-      *collnet = &ncclCollNet_v4_as_v5;
-      ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init;
+      ncclCollNets[0] = &ncclCollNet_v5_as_v6;
+      ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init;
+      ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
    }
  }
-  return;
+  return ncclSuccess;
 }

-ncclResult_t ncclNetInit() {
-  // Always initialize bootstrap network
-  NCCLCHECK(bootstrapNetInit());
+static ncclResult_t netGetState(int i, enum ncclNetState* state) {  // probes ncclNets[i] on first use and reports its cached state
+  pthread_mutex_lock(&netLock);  // the probe and the state read below both happen under netLock
+  if (ncclNetStates[i] == ncclNetStateInit) {  // ncclNets[i] is dereferenced unconditionally: callers must pass a non-null slot
+    int ndev;
+    if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled;
+    else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled;  // no usable device
+    else ncclNetStates[i] = ncclNetStateEnabled;
+  }
+  *state = ncclNetStates[i];
+  pthread_mutex_unlock(&netLock);
+  return ncclSuccess;  // always succeeds; a failed probe is reported through *state
+}

+static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {  // probes ncclCollNets[i] (must be non-null) on first use; result is cached
+  pthread_mutex_lock(&netLock);  // same lock as netGetState(): concurrent initializations must not race on the shared state table
+  if (ncclCollNetStates[i] == ncclNetStateInit) {
+    int ndev;  // only read after devices() reported success (short-circuit evaluation below)
+    ncclCollNetStates[i] = (ncclCollNets[i]->init(ncclDebugLog) == ncclSuccess && ncclCollNets[i]->devices(&ndev) == ncclSuccess && ndev > 0) ? ncclNetStateEnabled : ncclNetStateDisabled;
+  }
+  *state = ncclCollNetStates[i];
+  pthread_mutex_unlock(&netLock);
+  return ncclSuccess;  // always succeeds; a failed probe is reported through *state
+}
+
+ncclResult_t ncclNetInit(struct ncclComm* comm) {
  // Initialize main communication network
-  ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
-  ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr };
-  initPlugin(&nets[0], &collNets[0]);
  char* netName = getenv("NCCL_NET");
  bool ok = false;

  for (int i=0; i<3; i++) {
-    if (nets[i] == nullptr) continue;
-    if (netName && strcmp(netName, nets[i]->name) != 0) continue;
+    if (ncclNets[i] == nullptr) continue;
+    enum ncclNetState state;
+    NCCLCHECK(netGetState(i, &state));
+    if (state != ncclNetStateEnabled) continue;
+    if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;

-    // net plugin is already initialized
-    int ndev;
-    if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue;
-    if (nets[i]->devices(&ndev) != ncclSuccess) continue;
-    if (ndev <= 0) continue;
-    ncclNet = nets[i];
+    comm->ncclNet = ncclNets[i];
    ok = true;

-    if (collNets[i]) {
-      do {
-        if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break;
-        if (collNets[i]->devices(&ndev) != ncclSuccess) break;
-        if (ndev <= 0) break;
-        ncclCollNet = collNets[i];
-      } while(0);
+    if (ncclCollNets[i]) {
+      NCCLCHECK(collNetGetState(i, &state));
+      if (state == ncclNetStateEnabled) {
+        comm->ncclCollNet = ncclCollNets[i];
+      }
    }
    break;
  }
@@ -198,7 +293,7 @@ ncclResult_t ncclNetInit() {
  return ncclSuccess;
 }

-ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
+ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
  constexpr int GPU_BUF_SIZE = 2*1024*1024;
 #if CUDART_VERSION >= 11030
  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
@@ -213,12 +308,12 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
  }
 #endif
  int netDevs;
-  NCCLCHECK(ncclNetDevices(&netDevs));
+  NCCLCHECK(ncclNetDevices(comm, &netDevs));
  *gdrSupport = 0;
  for (int dev=0; dev<netDevs; dev++) {
    // Find a net device which is GDR-capable
    ncclNetProperties_t props;
-    NCCLCHECK(ncclNetGetProperties(dev, &props));
+    NCCLCHECK(ncclNetGetProperties(comm, dev, &props));
    if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    *gdrSupport = 1;
@@ -232,34 +327,34 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
    void* mHandle = NULL;
    ncclResult_t ret;
    ncclDebugNoWarn = NCCL_NET;
-    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
+    NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
    while (sComm == NULL) {
-      NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
+      NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
    }
    while (rComm == NULL) {
-      NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
+      NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
    }
    CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
-    if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
-      NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
-      NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
-      NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
+    if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+      NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
+      NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+      NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle));
      *gdrSupport = 1;
    }
    ncclDebugNoWarn = 0;
    CUDACHECK(hipFree(gpuPtr));
 cleanup4:
-    NCCLCHECK(ncclNetCloseRecv(rComm));
+    NCCLCHECK(ncclNetCloseRecv(comm, rComm));
 cleanup3:
-    NCCLCHECK(ncclNetCloseSend(sComm));
+    NCCLCHECK(ncclNetCloseSend(comm, sComm));
 cleanup2:
-    NCCLCHECK(ncclNetCloseListen(lComm));
+    NCCLCHECK(ncclNetCloseListen(comm, lComm));
 cleanup1:
    break;
  }
  return ncclSuccess;
 }

-int ncclNetVersion() {
-  return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5;
+int ncclNetVersion(struct ncclComm* comm) {  // reports the plugin API version behind comm->ncclNet
+  return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6);  // 4 or 5 only for the matching compatibility wrapper, otherwise 6
 }
@@ -14,6 +14,8 @@
 #define ENABLE_TIMER 0
 #include "timer.h"

+#include <sys/syscall.h>
+
 enum { proxyRecv=0, proxySend=1 };

 static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
@@ -350,10 +352,10 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
  return ncclSuccess;
 }

-static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) {
+static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
  if (peer < 0) return ncclSuccess;

-  struct ncclPeer* peerComm = channel->peers+peer;
+  struct ncclChannelPeer* peerComm = channel->peers+peer;
  struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
  if (connector->transportComm == NULL) {
    WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
@@ -362,35 +364,62 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s
  }
  if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;

-  NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
+  if (justInquire) *justInquire = true;
+  else {
+    NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
+  }
  return ncclSuccess;
 }

-ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) {
-  struct ncclChannel* channel = comm->channels+op->channelId;
-  int pattern = op->pattern;
-  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
-    struct ncclRing* ring = &channel->ring;
-    if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex));
-    if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex));
-  }
-  if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
-    // Tree up
-    struct ncclTree* tree = &channel->tree;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0));
-    NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0));
-  }
-  if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
-    // Tree down
-    struct ncclTree* tree = &channel->tree;
-    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0));
-    NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0));
-  }
-  if (pattern == ncclPatternCollTreeUpDown) {
-    // CollTree up
-    NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1));  // For CollTree up, we are using push
-    // CollTree down
-    NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0));
+// When justInquire is non-null, no proxy op is appended; *justInquire instead reports whether this
+// op needs ncclProxySaveOp (set to true once a connector involved has a proxyProgress hook).
+ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) {
+  struct ncclChannel* channel = &comm->channels[op->channelId];
+  if (justInquire) *justInquire = false;  // default answer; SaveProxy flips it to true when a proxy is needed
+  switch (op->pattern) {
+  case ncclPatternRing:
+  case ncclPatternRingTwice:
+  case ncclPatternPipelineFrom:
+  case ncclPatternPipelineTo: {
+      struct ncclRing* ring = &channel->ring;
+      if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
+        NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
+      }
+      if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
+        NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex, justInquire));
+      }
+    } break;
+  case ncclPatternTreeUp:
+  case ncclPatternTreeDown:
+  case ncclPatternTreeUpDown: {
+      if (op->pattern != ncclPatternTreeDown) { // Tree up
+        struct ncclTree* tree = &channel->tree;
+        for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) {
+          NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0, justInquire));  // SaveProxy returns early for negative peers
+        }
+        NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire));
+      }
+      if (op->pattern != ncclPatternTreeUp) { // Tree down
+        struct ncclTree* tree = &channel->tree;
+        for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) {
+          NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire));
+        }
+        NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire));
+      }
+    } break;
+  case ncclPatternCollTreeUpDown: {
+      // CollTree up
+      NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire));  // For CollTree up, we are using push
+      // CollTree down
+      NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire));
+    } break;
+  case ncclPatternSend:
+  case ncclPatternRecv: {
+      if (op->root == comm->rank) return ncclSuccess;  // op->root is passed as the peer below; nothing is saved when it is this rank
+      op->nsteps = DIVUP(op->nbytes, op->chunkSize);  // note: written even when justInquire is set
+      if (op->nsteps == 0) op->nsteps = 1;  // zero-byte transfers still take one step
+      NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
+    } break;
+  }
  return ncclSuccess;
 }
@@ -406,26 +435,24 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
  op->chunkSteps = 1;
  op->protocol = NCCL_PROTO_SIMPLE;
  op->dtype = info->datatype;
-  op->connIndex = info->connIndex;

-  int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
+  int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
+  if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR;
  info->chunkSize = stepSize;
  op->root = info->root;
  op->nbytes = info->count;
-  if (info->root == -1) return ncclSuccess;
-
-  struct ncclPeer* peer = channel->peers + op->root;
+  struct ncclChannelPeer* peer = channel->peers + op->root;

  if (info->coll == ncclFuncSend) {
    op->pattern = ncclPatternSend;
-    if (op->root != info->comm->rank && peer->send[info->connIndex].transportComm && peer->send[info->connIndex].transportComm->proxyProgress) {
+    if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
      // Tune chunk size for the network
      if (info->count < stepSize) info->chunkSize /= 4;
      else if (info->count < 8*stepSize) info->chunkSize /= 2;
    }
  } else if (info->coll == ncclFuncRecv) {
    op->pattern = ncclPatternRecv;
-    if (op->root != info->comm->rank && peer->recv[info->connIndex].transportComm && peer->recv[info->connIndex].transportComm->proxyProgress) {
+    if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
      // Tune chunk size for the network
      if (info->count < stepSize) info->chunkSize /= 4;
      else if (info->count < 8*stepSize) info->chunkSize /= 2;
@@ -441,22 +468,6 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
  return ncclSuccess;
 }

-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) {
-  struct ncclChannel* channel = comm->channels+op->channelId;
-  op->opCount = channel->workFifoTail-1;
-  if (op->root == comm->rank) return ncclSuccess;
-  if (op->pattern == ncclPatternRecv) {
-    op->nsteps = DIVUP(op->nbytes, op->chunkSize);
-    if (op->nsteps == 0) op->nsteps = 1;
-    NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, op->connIndex));
-  } else if (op->pattern == ncclPatternSend) {
-    op->nsteps = DIVUP(op->nbytes, op->chunkSize);
-    if (op->nsteps == 0) op->nsteps = 1;
-    NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, op->connIndex));
-  }
-  return ncclSuccess;
-}
-
 static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
  struct ncclProxyArgs* freeOp = *opPtr;
  struct ncclProxyArgs* next = freeOp->next;
@@ -598,8 +609,48 @@ void ncclDumpProxyState(int signal) {
  dumpProxyState(ncclLastProxyState);
 }

+NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);  // read below through ncclParamCreateThreadContext()
+ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {  // optionally creates/selects comm->proxyState.cudaCtx; does nothing when CUDART_VERSION < 11030
+#if CUDART_VERSION >= 11030
+  static int createThreadContext = -1;  // -1: not evaluated yet, 0: disabled, non-zero: enabled
+
+  if (createThreadContext == -1) {
+    createThreadContext = ncclParamCreateThreadContext();
+    if (createThreadContext) {
+      if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {  // required driver entry points missing
+        WARN("Unable to create thread context due to old driver, disabling.");
+        createThreadContext = 0;
+      }
+    }
+  }
+  if (createThreadContext) {
+    if (comm->proxyState.cudaCtx == NULL) {  // no context stored for this comm yet: create one
+      if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx,
+                                  CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) {
+        WARN("Failed to create CUDA context on device %d", comm->cudaDev);
+        createThreadContext = 0;  // creation failure disables the feature for later calls
+        return ncclSuccess;  // not treated as an error: the caller carries on without a dedicated context
+      }
+    } else {  // a context already exists for this comm: select it
+      if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) {
+        WARN("Failed to set CUDA context on device %d", comm->cudaDev);
+        return ncclUnhandledCudaError;  // the only failure this function reports
+      }
+    }
+  }
+#endif
+  return ncclSuccess;
+}
+
 void* ncclProxyProgress(void *comm_) {
  struct ncclComm* comm = (struct ncclComm*)comm_;
+  if (ncclSetThreadContext(comm) != ncclSuccess) {
+    WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev);
+  } else if (hipSetDevice(comm->cudaDev) != hipSuccess) {
+    WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev);
+  }
+  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+
  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
  state->nextOps = -1;
  signal(SIGUSR1, ncclDumpProxyState);
@@ -732,9 +783,9 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool,

 static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  if (connection->send) {
-    NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm));
+    NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm));
  } else {
-    NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm));
+    NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm));
  }
  return ncclSuccess;
 }
@@ -778,7 +829,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
  NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
  NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
  NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*)));
-  struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv;
+  struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv;
  // If we need proxy progress, map progress ops
  if (tcomm->proxyProgress) {
    char poolPath[] = "/dev/shm/nccl-XXXXXX";
@@ -885,7 +936,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
  NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
  connection->localRank = peer->localRank;
  NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*)));
-  connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv;
+  connection->tcomm = connection->send ? &ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv;
  // If we need proxy progress, let's allocate ops and start the thread
  if (connection->tcomm->proxyProgress) {
    NCCLCHECK(proxyProgressInit(comm));
@@ -951,7 +1002,10 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p

 void* ncclProxyService(void* _args) {
  struct ncclComm* comm =  (struct ncclComm *) _args;
-  if (hipSetDevice(comm->cudaDev) != hipSuccess) {
+  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  if (ncclSetThreadContext(comm) != ncclSuccess) {
+    WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev);
+  } else if (hipSetDevice(comm->cudaDev) != hipSuccess) {
    WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
  }
  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
@@ -11,16 +11,11 @@
 #define ENABLE_TIMER 0
 #include "timer.h"

-extern struct ncclTransport p2pTransport;
-extern struct ncclTransport shmTransport;
-extern struct ncclTransport netTransport;
-extern struct ncclTransport collNetTransport;
-
-struct ncclTransport ncclTransports[NTRANSPORTS] = {
-  p2pTransport,
-  shmTransport,
-  netTransport,
-  collNetTransport
+struct ncclTransport* ncclTransports[NTRANSPORTS] = {
+  &p2pTransport,
+  &shmTransport,
+  &netTransport,
+  &collNetTransport
 };

 template <int type>
@@ -37,10 +32,11 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
  }
  bool xgmi;
  NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
+
  for (int t=0; t<NTRANSPORTS; t++) {
    if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
    if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
-    struct ncclTransport *transport = ncclTransports+t;
+    struct ncclTransport *transport = ncclTransports[t];
    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
    int ret = 0;
    NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
@@ -55,18 +51,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
  return ncclSystemError;
 }

-ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
-  uint32_t mask = 1 << channel->id;
+  struct ncclChannel* channel = &comm->channels[channelId];
+  uint32_t mask = 1 << channelId;
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
-    comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
+    comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
-    comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
+    comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
  }
  return ncclSuccess;
 }
@@ -82,17 +79,18 @@ void dumpData(struct ncclConnect* data, int ndata) {

 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
  // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
+  int highestType = TRANSPORT_P2P;  // track highest transport type
+
  hipStream_t transportSetupStream;
  CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
-  int highestType = TRANSPORT_P2P;  // track highest transport type

  struct ncclConnect data[2*MAXCHANNELS];
  for (int i=1; i<comm->nRanks; i++) {
    int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
    int sendPeer = (comm->rank + i) % comm->nRanks;
-    uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
-    uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
+    uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
+    uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];

    struct ncclConnect* recvData = data;
    int sendChannels = 0, recvChannels = 0;
@@ -137,7 +135,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
        struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
        NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
        conn->connected = 1;
-        CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
+        CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
+        CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
      }
    }
    TIME_STOP(3);
@@ -147,11 +146,11 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
        struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
        NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
        conn->connected = 1;
-        CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
+        CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
      }
    }
    TIME_STOP(4);
-    comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
+    comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
  }
  CUDACHECK(hipStreamSynchronize(transportSetupStream));
  CUDACHECK(hipStreamDestroy(transportSetupStream));
@@ -179,10 +178,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
  // check if we can connect to collnet, whose root is the nranks-th rank
  struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
  peerInfo->rank = nranks;
-  int support = 1;
-  if (isMaster) {
-    NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo));
-  }

  // send master receives connect info from peer recv master
  if (isMaster && type == collNetSend) {
@@ -192,14 +187,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
  }

  // select
-  struct ncclPeer* root = channel->peers+nranks;
+  struct ncclChannelPeer* root = channel->peers+nranks;
  // connector index: 0 for recv, 1 for send
  struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
  struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
  conn->transportComm = transportComm;
  // setup
  struct ncclConnect myConnect;
-  if (isMaster && support) {
+  if (isMaster) {
    NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
  }
  // prepare connect handles
@@ -229,11 +224,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
    if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
  }
  // connect
-  if (isMaster && support) {
+  if (isMaster) {
    NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
-    struct ncclPeer* devRoot = channel->devPeers+nranks;
-    struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
-    CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
+    struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
+    struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
+    CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup);
  }
  // recv side sends connect info to send side
  if (isMaster && type == collNetRecv) {
@@ -242,7 +237,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
    NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
    TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
  }
-  if (support) fail = 0;
+  fail = 0;
 cleanup:
  if (allConnects != NULL) free(allConnects);
  if (masterConnects != NULL) free(masterConnects);
@@ -271,7 +266,7 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
  // Free collNet resources
  for (int r=0; r<comm->nChannels; r++) {
    struct ncclChannel* channel = comm->channels+r;
-    struct ncclPeer* peer = channel->peers+comm->nRanks;
+    struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
    for (int b=0; b<NCCL_MAX_CONNS; b++) {
      struct ncclConnector* send = peer->send + b;
      if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
@@ -108,7 +108,7 @@ struct sendResources {
  uint64_t step;
  struct reqSlot (*reqFifo)[NCCL_STEPS];
  int collNetRank;
-  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
+  volatile uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 struct recvResources {
@@ -128,12 +128,12 @@ struct recvResources {
  uint64_t step;
  struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
  int collNetRank;
-  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
+  volatile uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

-/* Determine if we can communicate with the peer */
 static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
-  *ret = 1;
+  // This transport cannot be used for p2p
+  *ret = 0;
  return ncclSuccess;
 }

@@ -157,7 +157,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));

-  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
+  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
      req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
  return ncclSuccess;
 }
@@ -175,7 +175,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));

-  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
+  INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
      req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
  return ncclSuccess;
 }
@@ -300,7 +300,7 @@ ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle
    comm->proxyState.progressState.collNet.resources = resources;
  }
  if (resources->collNetComms[netDev] == NULL)
-    NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
+    NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev));
  return ncclSuccess;
 }

@@ -314,13 +314,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
      struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
      handlePtrs[i] = &(info->collNetHandle);
    }
-    ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank,
+    ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank,
          resources->collNetListenComms[netDev],
          resources->collNetComms+netDev);
    free(handlePtrs);
    if (ret == ncclSuccess) {
      // Close listen comm
-      NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
+      NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev]));
    } else {
      resources->collNetListenComms[netDev] = NULL;
    }
@@ -334,7 +334,7 @@ static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
  struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
  resources->commRefCount[netDev]--;
  if (resources->commRefCount[netDev] == 0) {
-    NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
+    NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev]));
  }
  for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
  comm->proxyState.progressState.collNet.resources = NULL;
@@ -450,9 +450,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
  NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
  NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);

-  NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
-        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
-        &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+#if CUDA_VERSION >= 11070
+  /* DMA-BUF support */
+  if (resources->useGdr && comm->dmaBufSupport) {
+    int dmabuf_fd;
+    CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+    NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                                 NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
+                                 &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+    (void)close(dmabuf_fd);
+  } else // FALL-THROUGH to nv_peermem GDR path
+#endif
+  {
+    NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                           resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
+                           &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+  }

  *((struct connectMap**)respBuff) = &resources->map;
  return ncclSuccess;
@@ -506,9 +519,22 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
  NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
  NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);

-  NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
-        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
-        &resources->mhandles[NCCL_PROTO_SIMPLE]));
+#if CUDA_VERSION >= 11070
+  /* DMA-BUF support */
+  if (resources->useGdr && comm->dmaBufSupport) {
+    int dmabuf_fd;
+    CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+    NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                                 NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
+                                 &resources->mhandles[NCCL_PROTO_SIMPLE]));
+    (void)close(dmabuf_fd);
+  } else // FALL-THROUGH to nv_peermem GDR path
+#endif
+  {
+    NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
+                           resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
+                           &resources->mhandles[NCCL_PROTO_SIMPLE]));
+  }

  // Pass info to send side
  info->reqFifo = resources->reqFifo;
@@ -524,7 +550,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (resources->sendMhandles[p]) {
-      NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p]));
+      NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
    }
  }
  struct connectMapMem* mems = resources->map.mems;
@@ -541,7 +567,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (resources->mhandles[p]) {
-      NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p]));
+      NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
    }
  }
  struct connectMapMem* mems = resources->map.mems;
@@ -621,9 +647,9 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
            args->idle = 0;
            //continue;
            // flush HDP if not done
-            if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
-              args->hdp_flushed = LOAD(recvTail);
-              STORE(resources->curr_hdp_reg, 1);
+            if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
+              args->hdp_flushed = *recvTail;
+              *resources->curr_hdp_reg = 1;
            }
          }
        }
@@ -634,10 +660,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
        int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
        if (reqFifo[group][buffSlot].recvBuff != NULL) {
          int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
-          int count = totalSize / ncclTypeSize(args->dtype);
+          int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
          reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
          char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
-          NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
+          NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
          if (sub->requests[buffSlot] == NULL) continue;

          TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
@@ -653,7 +679,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
        int done, size;
        int group = s / COLLNET_GROUP_NSUBS;
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
-        NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size));
+        NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size));
        if (done) {
          TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
          // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
@@ -744,7 +770,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
              int startChannel = group*COLLNET_GROUP_NSUBS;
              int offset;
              NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
-              NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
+              NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
            }
          } else {
            for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@@ -758,7 +784,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
        int group = s / COLLNET_GROUP_NSUBS;
        int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
        int done = 1;
-        if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL));
+        if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL));
        if (done) {
          TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
          for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@@ -8,15 +8,11 @@
 #include "comm.h"
 #include "net.h"
 #include "graph.h"
-#include <sys/time.h>
 #include "proxy.h"
 #include "collectives.h"
 #include "gdrwrap.h"
 #include "shm.h"
 #include "profiler.h"
-#if defined(ENABLE_NPKIT)
-#include "npkit/npkit.h"
-#endif
 #include "graph.h"
 #include "graph/topo.h"

@@ -108,7 +104,7 @@ struct sendResources {
  void* mhandles[NCCL_NUM_PROTOCOLS];
  uint64_t step;
  uint64_t llLastCleaning;
-  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
+  volatile uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 struct recvResources {
@@ -136,7 +132,7 @@ struct recvResources {
  void* mhandles[NCCL_NUM_PROTOCOLS];
  uint64_t step;
  uint64_t llLastCleaning;
-  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
+  volatile uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
@@ -178,7 +174,6 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
  req.channelId = channelId;
  req.connIndex = connIndex;
-  req.netDev = -1;
  req.curr_hdp_reg = 0;

  int proxyRank = myInfo->rank;
@@ -198,12 +193,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));

  if (proxyRank == myInfo->rank) {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d",
-        channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
        req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
  } else {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d",
-        channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
        proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
  }
  *((int*)connectInfo) = proxyRank;
@@ -222,7 +215,6 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
  req.channelId = channelId;
  req.connIndex = connIndex;
-  req.netDev = -1;

  // Use myInfo->rank as the receiver uses its own NIC
  int proxyRank = myInfo->rank;
@@ -238,8 +230,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  req.remoteRank = peerInfo->rank;
  NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));

-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d",
-      channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
      req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
  return ncclSuccess;
 }
@@ -448,7 +439,7 @@ static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, i
 static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
  int rank = comm->localRankToRank[connection->localRank];
  int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
-  NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
+  NCCLCHECK(sharedBuffersInit(comm, comm->hasFineGrain, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
  return ncclSuccess;
 }

@@ -470,7 +461,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
  resources->connIndex = req->connIndex;
  resources->curr_hdp_reg = req->curr_hdp_reg;
  ncclNetProperties_t props;
-  NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
+  NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
  resources->maxRecvs = props.maxRecvs;

  // We don't return any data
@@ -496,11 +487,11 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
  resources->channelId = req->channelId;
  resources->connIndex = req->connIndex;
  ncclNetProperties_t props;
-  NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
+  NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
  resources->maxRecvs = props.maxRecvs;

  if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
-  NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm));
+  NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
  *done = 1;
  return ncclSuccess;
 }
@@ -527,15 +518,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
        NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
      }
      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
-      if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId));
+      if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId));
      resources->netSendComm = comms->sendComm[resources->channelId];
      if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
    } else {
-      NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+      NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
    }
  } else {
    // Connect to remote peer
-    NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+    NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
    connection->proxyAppendPtr = &connection->proxyAppend;
  }
  if (resources->netSendComm == NULL) {
@@ -609,7 +600,31 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
    if (resources->buffers[p]) {
-      NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+#if CUDA_VERSION >= 11070
+      /* DMA-BUF support */
+      int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+      if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
+        int dmabuf_fd;
+        CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
+        (void)close(dmabuf_fd);
+      } else // FALL-THROUGH to nv_peermem GDR path
+#else
+      /* DMA-BUF support */
+      int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+      if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
+        int dmabuf_fd;
+        uint64_t offset;
+        CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
+        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
+        (void)close(dmabuf_fd);
+        INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
+          (const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
+      } else // FALL-THROUGH to nv_peermem GDR path
+#endif
+      {
+        NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+      }
    }
  }

@@ -643,15 +658,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
        NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
      }
      struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
-      if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId));
+      if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId));
      resources->netRecvComm = comms->recvComm[resources->channelId];
      if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
    } else {
-      NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+      NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
    }
  } else {
    // Connect to remote peer
-    NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+    NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
    connection->proxyAppendPtr = &connection->proxyAppend;
  }
  if (resources->netRecvComm == NULL) {
@@ -659,7 +674,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
    return ncclSuccess;
  }
  *done = 1;
-  NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
+  NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));

  // Create structures
  struct connectMap* map = &resources->map;
@@ -714,7 +729,31 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
    if (resources->buffers[p]) {
-      NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+#if CUDA_VERSION >= 11070
+      /* DMA-BUF support */
+      int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+      if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
+        int dmabuf_fd;
+        CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
+        (void)close(dmabuf_fd);
+      } else // FALL-THROUGH to nv_peermem GDR path
+#else
+      /* DMA-BUF support */
+      int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+      if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
+        int dmabuf_fd;
+        uint64_t offset;
+        CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
+        NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
+        (void)close(dmabuf_fd);
+        INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
+          (const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
+      } else // FALL-THROUGH to nv_peermem GDR path
+#endif
+      {
+        NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+      }
    }
  }

@@ -732,7 +771,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
  }
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (resources->buffers[p]) {
-      NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p]));
+      NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
    }
  }
  struct connectMapMem* mems = resources->map.mems;
@@ -748,12 +787,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
      struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
      comms->sendRefCount[resources->channelId]--;
-      if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId]));
+      if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
    } else {
-      NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
+      NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
    }
  } else {
-    NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
+    NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
  }
  free(resources);
  return ncclSuccess;
@@ -767,7 +806,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
  }
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (resources->buffers[p]) {
-      NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p]));
+      NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
    }
  }
  struct connectMapMem* mems = resources->map.mems;
@@ -779,12 +818,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
      struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
      comms->recvRefCount[resources->channelId]--;
-      if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId]));
+      if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
    } else {
-      NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
+      NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
    }
  } else {
-    NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
+    NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
  }
  free(resources);
  return ncclSuccess;
@@ -792,16 +831,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct

 static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");

-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-static int g_npkit_net_poll_cnt = 0;
-#endif
-
 static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
-
-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-  g_npkit_net_poll_cnt++;
-#endif
-
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
@@ -855,11 +885,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
        if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
          // We have something to receive, let's check if it's completely ready.
          int size = sizesFifo[buffSlot];
-
-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
-          sub->npKitSizesFifo[buffSlot] = size;
-#endif
-
          char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
          int ready = 1;
          if (p == NCCL_PROTO_LL128) {
@@ -887,29 +912,13 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
          }
          if (ready) {
            // flush HDP if not done
-            if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
-              args->hdp_flushed = LOAD(recvTail);
-              STORE(resources->curr_hdp_reg, 1);
+            if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
+              args->hdp_flushed = *recvTail;
+              *resources->curr_hdp_reg = 1;
            }
            // Data is ready, try to send.
-            NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
+            NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
            if (sub->requests[buffSlot] != NULL) {
-
-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
-              NpKit::CollectCpuEvent(
-                  NPKIT_EVENT_NET_SEND_ENTRY,
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-                  g_npkit_net_poll_cnt,
-#else
-                  size,
-#endif
-                  uint64_t(sub->requests+buffSlot)/sizeof(void*),
-                  *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-              g_npkit_net_poll_cnt = 0;
-#endif
-#endif
-
              TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
              sizesFifo[buffSlot] = -1;
              // Make sure size is reset to zero before we update the head.
@@ -926,24 +935,8 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
      if (sub->done < sub->transmitted) {
        int done;
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
-        NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
+        NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL));
        if (done) {
-
-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
-          NpKit::CollectCpuEvent(
-              NPKIT_EVENT_NET_SEND_EXIT,
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-              g_npkit_net_poll_cnt,
-#else
-              sub->npKitSizesFifo[buffSlot],
-#endif
-              uint64_t(sub->requests+buffSlot)/sizeof(void*),
-              *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-          g_npkit_net_poll_cnt = 0;
-#endif
-#endif
-
          TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
          sub->done += args->sliceSteps;
          for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
@@ -969,11 +962,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
 }

 static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
-
-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-  g_npkit_net_poll_cnt++;
-#endif
-
  if (args->state == ncclProxyOpReady) {
    // Initialize subs and group them by same recvComm.
    void* recvComm;
@@ -1051,26 +1039,10 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
        uint64_t step = subGroup->posted;
        struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
        void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
-        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
+        NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
        if (*requestPtr) {
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup+i;
-
-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
-            NpKit::CollectCpuEvent(
-                NPKIT_EVENT_NET_RECV_ENTRY,
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-                g_npkit_net_poll_cnt,
-#else
-                sizes[i],
-#endif
-                uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
-                *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-            g_npkit_net_poll_cnt = 0;
-#endif
-#endif
-
            sub->posted += args->sliceSteps;
            for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
          }
@@ -1089,29 +1061,13 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
        int sizes[NCCL_PROXY_MAX_SUBS];
        void* mhandles[NCCL_PROXY_MAX_SUBS];
        for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
-        NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes));
+        NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
        if (done) {
          int useGdr = 0;
          int totalSize = 0;
          for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup + i;
-
-#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
-            NpKit::CollectCpuEvent(
-                NPKIT_EVENT_NET_RECV_EXIT,
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-                g_npkit_net_poll_cnt,
-#else
-                sizes[i],
-#endif
-                uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
-                *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
-#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
-            g_npkit_net_poll_cnt = 0;
-#endif
-#endif
-
            sub->received += args->sliceSteps;
            for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
            if (step < sub->nsteps) {
@@ -1146,7 +1102,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
                }
              }
              struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
-              NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
+              NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
            }
          }
          args->idle = 0;
@@ -1161,7 +1117,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
        uint64_t step = subGroup->transmitted;
        int done = 1;
        void* request = subGroup->requests[step%NCCL_STEPS];
-        if (request) NCCLCHECK(ncclNetTest(request, &done, NULL));
+        if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL));
        if (done) {
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup + i;
@@ -296,6 +296,31 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
  return ncclSuccess;
 }

+// Detect whether DMA-BUF support is present in the kernel.
+// The probe runs only once: its outcome is cached in a function-local static
+// and returned for every later call, whichever device index is passed.
+// Returns :
+// ncclSuccess : DMA-BUF support is available
+// ncclSystemError : DMA-BUF is not supported by the kernel, or the probe's
+//                   protection domain could not be allocated/released
+ncclResult_t ncclIbDmaBufSupport(int dev) {
+  static int dmaBufSupported = -1;
+  if (dmaBufSupported == -1) {
+    ncclResult_t res;
+    struct ibv_pd* pd;
+    struct ibv_context* ctx;
+    ctx = ncclIbDevs[dev].context;
+    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
+    // Clear errno first so that a stale value left by an earlier, unrelated
+    // call cannot be mistaken for the outcome of the probe below
+    errno = 0;
+    // Test kernel DMA-BUF support with a dummy call (fd=-1)
+    (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
+    // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
+    dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
+    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
+  }
+  if (dmaBufSupported == 0) return ncclSystemError;
+  return ncclSuccess;
+failure:
+  // A failed probe is cached as "not supported" and is never retried
+  dmaBufSupported = 0;
+  return ncclSystemError;
+}
+
 static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
  memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
  return ncclSuccess;
@@ -308,10 +333,11 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
  props->pciPath = ncclIbDevs[dev].pciPath;
  props->guid = ncclIbDevs[dev].guid;
  props->ptrSupport = NCCL_PTR_HOST;
-  if (ncclIbGdrSupport(dev) != ncclSuccess) {
-    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
-  } else {
-    props->ptrSupport |= NCCL_PTR_CUDA;
+  if (ncclIbGdrSupport(dev) == ncclSuccess) {
+    props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem
+  }
+  if (ncclIbDmaBufSupport(dev) == ncclSuccess) {
+    props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF
  }
  props->speed = ncclIbDevs[dev].speed;
  props->latency = 0; // Not set
@@ -568,6 +594,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
  static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
  memset(handle, 0, sizeof(struct ncclIbHandle));
  comm->dev = dev;
+  comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */
  NCCLCHECK(GetSocketAddr(&comm->sock.addr));
  if (ncclParamIbSockServerPortReuse()) {
    // reuse the socket address and fd for listen system call
@@ -614,7 +641,7 @@ ib_connect_check:
    /* expect user to call again */
    return ncclSuccess;
  } else if (conState == ncclSocketError) {
-    return ncclSystemError;
+    return ncclRemoteError;
  }

  // IB Setup
@@ -692,7 +719,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
  stage->comm = rComm;
  stage->state = ncclIbCommStateAccept;
  lComm->sock.asyncFlag = 1;
-  rComm->sock.asyncFlag = 1;

 ib_accept:
  NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
@@ -846,7 +872,8 @@ ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {

 ncclResult_t ncclIbTest(void* request, int* done, int* size);

-ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+/* DMA-BUF support */
+ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
  static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
  assert(size > 0);

@@ -856,7 +883,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
  struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
  uintptr_t addr = (uintptr_t)data & -pageSize;
-  int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
  ncclResult_t res;
  pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
  for (int slot=0; /*true*/; slot++) {
@@ -868,14 +895,20 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
      // Deregister / register
      struct ibv_mr* mr;
      unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
-      if (ncclIbRelaxedOrderingEnabled) {
-        // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
-        NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning);
+      if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING;
+      if (fd != -1) {
+        /* DMA-BUF support */
+        NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
+      } else {
+        if (ncclIbRelaxedOrderingEnabled) {
+          // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+          NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning);
+        }
+        else {
+          NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
+        }
      }
-      else {
-        NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
-      }
-      TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey);
+      TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x fd %d", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd);
      cache->population += 1;
      cache->slots[slot].addr = addr;
      cache->slots[slot].pages = pages;
@@ -897,6 +930,10 @@ returning:
  return res;
 }

+// Registration entry point without DMA-BUF: forwards to ncclIbRegMrDmaBuf()
+// with offset 0 and fd -1 (fd == -1 makes it take the non-DMA-BUF registration path).
+ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  // Check the sign before widening: a negative int would become a huge size_t
+  // and slip past the size > 0 assertion inside ncclIbRegMrDmaBuf().
+  assert(size > 0);
+  return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle);
+}
+
 ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
  struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
@@ -950,13 +987,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {

  // Write size as immediate data. In the case of multi-send, only write
  // 0 or 1 as size to indicate whether there was data sent or received.
-  uint64_t immData = 0;
+  uint32_t immData = 0;
  if (nreqs == 1) {
    immData = reqs[0]->send.size;
  } else {
-    uint8_t* multiImmData = (uint8_t*)&immData;
+    if (nreqs > 32) {
+      WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs);
+      return ncclInternalError;
+    }
    for (int r=0; r<nreqs; r++) {
-      multiImmData[r] = reqs[r]->send.size ? 1 : 0;
+      immData |= (reqs[r]->send.size ? 1 : 0) << r;
    }
  }

@@ -1231,7 +1271,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
        char line[SOCKET_NAME_MAXLEN+1];
        WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
             ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
-        return ncclSystemError;
+        return ncclRemoteError;
      }

      struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
@@ -1246,9 +1286,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
          if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
          if (req->nreqs > 1) {
            // In the case of a multi recv, we only set sizes to 0 or 1.
-            uint8_t* sizes = (uint8_t*)&wc->imm_data;
            for (int i=0; i<req->nreqs; i++) {
-              req->recv.sizes[i] |= sizes[i];
+              req->recv.sizes[i] = (wc->imm_data >> i) & 0x1;
            }
          } else {
            req->recv.sizes[0] += wc->imm_data;
@@ -1309,6 +1348,7 @@ ncclNet_t ncclNetIb = {
  ncclIbConnect,
  ncclIbAccept,
  ncclIbRegMr,
+  ncclIbRegMrDmaBuf,
  ncclIbDeregMr,
  ncclIbIsend,
  ncclIbIrecv,
@@ -311,6 +311,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
  struct ncclSocketListenComm* comm;
  NCCLCHECK(ncclSocketNewListenComm(&comm));
  NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
+  comm->sock.asyncFlag = 1;
  NCCLCHECK(ncclSocketListen(&comm->sock));
  memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
  NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
@@ -359,7 +360,7 @@ socket_connect_check:
      /* expect user to call again */
      return ncclSuccess;
    } else if (conState == ncclSocketError) {
-      return ncclSystemError;
+      return ncclRemoteError;
    }
    stage->state = ncclSocketCommStateSend;

@@ -616,6 +617,7 @@ ncclNet_t ncclNetSocket = {
  ncclSocketConnect,
  ncclSocketAccept,
  ncclSocketRegMr,
+  NULL, // No DMA-BUF support
  ncclSocketDeregMr,
  ncclSocketIsend,
  ncclSocketIrecv,
@@ -8,6 +8,7 @@
 #include "comm.h"
 #include "graph.h"
 #include "utils.h"
+#include "shm.h"
 #include "graph.h"
 #include "graph/topo.h"

@@ -20,6 +21,34 @@ struct p2pConnectInfo {
  int rank;
  int read;
  struct ncclP2pBuff p2pBuff;
+  // Used by CE memcpy
+  char shmName[7];
+  int shmSize;
+};
+static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
+
+// Layout of the shared-memory segment used by the CE memcpy path.
+// p2pSendProxySetup() sizes the segment as
+// sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem).
+struct p2pShm {
+  // sendMem.head is the head pointer used by both the send and the recv connector
+  struct ncclSendMem sendMem;
+  // recvMem.tail is written by the send proxy and read as the receiver's tail
+  struct ncclRecvMem recvMem;
+};
+// State of the send-side proxy for the CE memcpy path. Allocated by
+// p2pSendProxySetup(), which also copies it into the setup response
+// (p2pSendSetup() receives that copy in p2pSendResources::proxyInfo).
+struct p2pProxyInfo {
+  // Shared memory between proxy and receiving GPU
+  // (shm and devShm are the two pointers filled in by ncclShmOpen())
+  struct p2pShm* shm;
+  struct p2pShm* devShm;
+  // Segment name with the "/dev/shm/nccl-" prefix stripped, and the segment size in bytes
+  char shmName[7];
+  int shmSize;
+
+  // Intermediate step for sender
+  // ceRecvMem: allocated with ncclCudaHostCalloc(); the sending GPU's tail and sizesFifo point into it
+  // ceDevBuff: allocated with ncclCudaCalloc(); replaces the sender's SIMPLE protocol buffer
+  struct ncclRecvMem* ceRecvMem;
+  char* ceDevBuff;
+
+  // Receiver buffer
+  // (the SIMPLE buffer address handed over in the p2pSendProxyConnect() request)
+  char* recvFifo;
+
+  // Used by progress only
+  // step: step counter carried from one operation to the next
+  // stream/events: created in p2pSendProxyConnect(), one event per buffer slot
+  uint64_t step;
+  hipStream_t stream;
+  hipEvent_t events[NCCL_STEPS];
 };
 static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");

@@ -28,18 +57,22 @@ struct p2pSendResources {
  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
  void* sendMemIpc;
  void* recvMemIpc;
+  struct p2pProxyInfo proxyInfo;
 };

 struct p2pRecvResources {
   struct ncclRecvMem* devMem;
   void* sendMemIpc;
   void* recvMemIpc;
+  // CE memcpy path only: segment opened by p2pRecvConnect() with ncclShmOpen()
+  // and released by p2pRecvFree() with ncclShmClose(); shmSize is its size in bytes
+  struct p2pShm* shm;
+  struct p2pShm* devShm;
+  int shmSize;
 };

 #include <sys/types.h>

 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
-int busIdToCudaDev(int64_t busId) {
+static int busIdToCudaDev(int64_t busId) {
  int ndev;
  if (hipGetDeviceCount(&ndev) != hipSuccess)
    return -1;
@@ -55,8 +88,13 @@ int busIdToCudaDev(int64_t busId) {
  return -1;
 }

+// Parameter P2P_USE_CUDA_MEMCPY (default 0): when non-zero, the P2P transport
+// goes through the proxy-driven CE memcpy path instead of direct/IPC access.
+NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
+// Cached copy of the parameter above; written once by initCeOperation()
+static int useMemcpy = 0;
+// Defined at the end of the file, after p2pTransport, whose proxy hooks it fills in
+static void initCeOperation();
+
 /* Determine if two peers can communicate through p2p */
 ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  initCeOperation();
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  if (!info1->hasFineGrain || !info2->hasFineGrain)  {
    *ret = 0;
@@ -74,7 +112,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
  int intermediateRank;
  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
  if (*ret == 0) return ncclSuccess;
-  if (intermediateRank != -1) return ncclSuccess;
+  if (intermediateRank != -1) {
+    if (useMemcpy) *ret = 0;
+    return ncclSuccess;
+  }

  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
  int cudaDev1 = busIdToCudaDev(info1->busId);
@@ -94,7 +135,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
  int p2p;
  if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
-	 cudaDev1, info1->busId, cudaDev2, info2->busId);
+         cudaDev1, info1->busId, cudaDev2, info2->busId);
    *ret = 0;
    return ncclSuccess;
  }
@@ -188,6 +229,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  send->transportResources = resources;
  int useRead, intermediateRank;
  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
+  if (useMemcpy) useRead = 0;

  resources->next_hdp_reg = 0;
  bool isXGMI;
@@ -214,14 +256,14 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st

  if (intermediateRank == -1) {
    info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash) {
+    if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
      if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
      INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
    } else {
      send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
-      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s comm %p nRanks %02d",
-          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
+      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
+          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
    }
  } else {
    info->rank = intermediateRank;
@@ -231,9 +273,15 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  }

  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
-  NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+  if (useMemcpy) {
+    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
+    info->shmSize = resources->proxyInfo.shmSize;
+    memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
+  } else {
+    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+    NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
+  }

-  NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
  return ncclSuccess;
 }

@@ -259,7 +307,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st

  if (intermediateRank == -1) {
    info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash) {
+    if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
      if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
    } else {
      recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
@@ -287,31 +335,61 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (info->read && p == NCCL_PROTO_SIMPLE) {
      /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
+      if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy
      send->conn.buffs[p] = (char*)(resources->devMem+1);
    } else {
      send->conn.buffs[p] = buff;
      buff += send->comm->buffSizes[p];
    }
  }
-  send->conn.tail = &remDevMem->tail;
-  send->conn.head = &resources->devMem->head;
-  send->conn.ptrExchange = &resources->devMem->ptrExchange;
-  send->conn.next_hdp_reg = resources->next_hdp_reg;
-  send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
+
+  if (useMemcpy) {
+    send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
+    send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
+    send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
+    // Send SIMPLE buff to proxy, and replace it by local buffer
+    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
+    send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
+  } else {
+    send->conn.tail = &remDevMem->tail;
+    send->conn.head = &resources->devMem->head;
+    send->conn.ptrExchange = &resources->devMem->ptrExchange;
+    send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
+  }
  return ncclSuccess;
 }

 /* Connect/Recv from this peer */
 ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
-  struct ncclSendMem* remDevMem;
  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;

-  NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
+  struct ncclSendMem* remDevMem = NULL;
+
+  if (useMemcpy) {
+    char shmPath[PATH_MAX];
+    sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
+    TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
+    resources->shmSize = info->shmSize;
+    NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0));
+    // Remove the file to ensure proper clean-up
+    NCCLCHECK(ncclShmUnlink(shmPath));
+
+    recv->conn.tail = &resources->devShm->recvMem.tail;
+    recv->conn.head = &resources->devShm->sendMem.head;
+  } else {
+    NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
+
+    recv->conn.tail = &resources->devMem->tail;
+    recv->conn.head = &remDevMem->head;
+    recv->conn.ptrExchange = &remDevMem->ptrExchange;
+    recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
+  }

  char* buff = (char*)(resources->devMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (info->read && p == NCCL_PROTO_SIMPLE) {
+      if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
      /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
      recv->conn.buffs[p] = (char*)(remDevMem+1);
    } else {
@@ -319,10 +397,6 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
      buff += recv->comm->buffSizes[p];
    }
  }
-  recv->conn.tail = &resources->devMem->tail;
-  recv->conn.head = &remDevMem->head;
-  recv->conn.ptrExchange = &remDevMem->ptrExchange;
-  recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
  return ncclSuccess;
 }

@@ -338,11 +412,52 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
  if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));
  if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));
+  if (useMemcpy) {
+    NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
+  }
  free(resources);
  return ncclSuccess;
 }

-static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+// Proxy-side setup for a send connection.
+//  - CE memcpy path (useMemcpy): allocates a p2pProxyInfo with a device staging
+//    buffer, a fresh shared-memory segment and a host ncclRecvMem, then copies the
+//    whole p2pProxyInfo into respBuff (respSize must be sizeof(struct p2pProxyInfo)).
+//  - Otherwise: reqBuff holds an int size; a device buffer of that size is allocated
+//    and its pointer and IPC handle are returned through respBuff as a struct ncclP2pBuff.
+// Sets *done to 1 on success. Returns ncclInternalError on a request/response size mismatch.
+static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  if (useMemcpy) {
+    struct p2pProxyInfo* proxyInfo;
+    NCCLCHECK(ncclCalloc(&proxyInfo, 1));
+    // Stored before the remaining allocations, so the connection already references proxyInfo if one of them fails
+    connection->transportResources = proxyInfo;
+
+    // Device staging buffer, sized like the SIMPLE protocol buffer
+    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE], true));
+
+    char shmPath[PATH_MAX];
+    // Empty path with last argument 1: presumably asks ncclShmOpen() to create a new segment and write its path back - confirm
+    shmPath[0] = '\0';
+    proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
+    NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1));
+    TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
+    // Keep only what follows the "/dev/shm/nccl-" prefix; p2pRecvConnect() rebuilds the full path from it
+    memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
+
+    NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
+
+    // NOTE(review): the size check comes after the allocations above; on a mismatch they
+    // stay attached to connection->transportResources - confirm the caller cleans up
+    if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError;
+    memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo));
+  } else {
+    if (reqSize != sizeof(int)) return ncclInternalError;
+    int size = *((int*)reqBuff);
+    if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
+    struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
+    NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, true));
+    connection->transportResources = p2pBuff->directPtr;
+    hipError_t res = hipIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
+    if (res != hipSuccess) {
+      WARN("hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
+      hipFree(p2pBuff->directPtr);
+      // NOTE(review): p2pBuff aliases respBuff, which is supplied by the caller - confirm this
+      // function owns it; connection->transportResources also still holds the freed device pointer
+      free(p2pBuff);
+      CUDACHECK(res);
+    }
+  }
+  *done = 1;
+  return ncclSuccess;
+}
+
+static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  if (reqSize != sizeof(int)) return ncclInternalError;
  int size = *((int*)reqBuff);
  if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
@@ -360,15 +475,116 @@ static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct
  return ncclSuccess;
 }

-static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+// Proxy-side connect for a send connection on the CE memcpy path
+// (installed into p2pTransport by initCeOperation() only when useMemcpy is set).
+// reqBuff must hold exactly one pointer: the buffer the proxy will copy into.
+// Creates the copy stream and one event per buffer slot.
+// Returns ncclInternalError when reqSize is not sizeof(void*).
+// NOTE(review): *done is never written here - presumably pre-set by the caller; confirm.
+static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
+
+  if (reqSize != sizeof(void*)) return ncclInternalError;
+  // Destination of the hipMemcpyAsync() calls issued by p2pSendProxyProgress()
+  proxyInfo->recvFifo = *((char**)reqBuff);
+
+  CUDACHECK(hipStreamCreateWithFlags(&proxyInfo->stream, hipStreamNonBlocking));
+  for (int i=0; i<NCCL_STEPS; i++) {
+    CUDACHECK(hipEventCreate(proxyInfo->events+i));
+  }
+  connection->proxyAppendPtr = &connection->proxyAppend;
+  return ncclSuccess;
+}
+
+// Releases the proxy-side resources of a send connection.
+// CE memcpy path: closes the shared-memory segment, frees the host and device
+// staging memory, destroys the stream and events, then frees the p2pProxyInfo.
+// Otherwise: transportResources is the device buffer allocated by p2pSendProxySetup().
+static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+  if (useMemcpy) {
+    struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
+    // NOTE(review): each checked call below returns on its first failure,
+    // leaving the remaining resources unreleased
+    NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
+    NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
+    CUDACHECK(hipFree(proxyInfo->ceDevBuff));
+    CUDACHECK(hipStreamDestroy(proxyInfo->stream));
+    for (int i=0; i<NCCL_STEPS; i++) {
+      CUDACHECK(hipEventDestroy(proxyInfo->events[i]));
+    }
+    free(proxyInfo);
+  } else {
+    // Do not check return code as CUDA may have already shut down
+    hipFree(connection->transportResources);
+  }
+  return ncclSuccess;
+}
+
+// Releases the proxy-side resource of a recv connection: transportResources is
+// passed straight to hipFree(). Always returns ncclSuccess.
+static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
   // Do not check return code as CUDA may have already shut down
   hipFree(connection->transportResources);
   return ncclSuccess;
 }

+// Progress function of the send proxy on the CE memcpy path.
+// For every sub-operation it (1) waits for the GPU to publish a step through
+// ceRecvMem->tail, (2) issues an asynchronous device-to-device copy of that step
+// from ceDevBuff to recvFifo followed by an event record, and (3) once the event
+// has completed, publishes the new tail to the shared-memory segment.
+// Sets args->state to ncclProxyOpNone when every sub-operation is finished.
+static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+  if (args->state == ncclProxyOpReady) {
+    for (int s=0; s<args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs+s;
+      struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
+      // Start from the connection's running step counter, rounded with ROUNDUP() to args->chunkSteps
+      sub->base = ROUNDUP(resources->step, args->chunkSteps);
+      sub->posted = sub->transmitted = sub->done = 0;
+    }
+    args->state = ncclProxyOpProgress;
+  }
+  args->idle = 1;
+  if (args->state == ncclProxyOpProgress) {
+    int p = args->protocol;
+    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
+    for (int s=0; s<args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs+s;
+      struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
+      // Other protocols need no copy: only advance the step counter and mark the sub as done
+      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses hipMemcpy
+          resources->step = sub->base + sub->nsteps;
+          args->done++;
+          continue;
+      }
+      // At most NCCL_STEPS steps may be in flight (copy issued, event not yet completed)
+      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
+        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
+        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
+        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
+        // Check GPU has sent everything
+        if ((*recvTail > sub->base+sub->transmitted)) {
+          int size = sizesFifo[buffSlot];
+          CUDACHECK(hipMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, hipMemcpyDeviceToDevice, resources->stream));
+          // The event marks completion of the copy for this slot; it is polled below
+          CUDACHECK(hipEventRecord(resources->events[buffSlot], resources->stream));
+          sub->transmitted += args->sliceSteps;
+        }
+      }
+      if (sub->done < sub->transmitted) {
+        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
+        hipError_t res = hipEventQuery(resources->events[buffSlot]);
+        // hipErrorNotReady only means the copy is still running; anything else is a real error
+        if (res != hipErrorNotReady) CUDACHECK(res);
+        if (res == hipSuccess) {
+          sub->done += args->sliceSteps;
+          // Notify SHM
+          resources->shm->recvMem.tail = sub->base + sub->done;
+        }
+        if (sub->done == sub->nsteps) {
+          // Remember where this operation ended; the next one starts from this step
+          resources->step = sub->base + sub->nsteps;
+          args->done++;
+        }
+      }
+    }
+    if (args->done == args->nsubs) {
+      args->state = ncclProxyOpNone;
+    }
+  }
+  return ncclSuccess;
+}
+
 struct ncclTransport p2pTransport = {
   "P2P",
   p2pCanConnect,
-  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL },
-  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }
+  // Send and recv now use separate proxy setup/free functions. initCeOperation() installs
+  // send.proxyConnect and send.proxyProgress at run time when the CE memcpy path is enabled
+  // (presumably two of the NULL slots in the send entry - confirm against struct ncclTransport).
+  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL },
+  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL }
 };
+
+// One-time initialisation of the CE memcpy mode; called from p2pCanConnect().
+// Reads the P2pUseCudaMemcpy parameter into useMemcpy and, when it is set,
+// installs the send-side proxy connect/progress hooks into p2pTransport.
+// NOTE(review): the init flag is a plain static int with no locking - confirm
+// that concurrent first calls cannot happen.
+static void initCeOperation() {
+  static int init = 0;
+  if (!init) {
+    useMemcpy = ncclParamP2pUseCudaMemcpy();
+    if (useMemcpy) {
+      p2pTransport.send.proxyConnect = p2pSendProxyConnect;
+      p2pTransport.send.proxyProgress = p2pSendProxyProgress;
+    }
+    init = 1;
+  }
+}
@@ -31,11 +31,21 @@ struct shmRecvResources {
  struct ncclRecvMem* devHostMem;
 };

+// Values shared by the SHM_MEMCPY_MODE and SHM_LOCALITY parameters below
+#define SHM_SEND_SIDE 1
+#define SHM_RECV_SIDE 2
 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
+// Parameter SHM_USE_CUDA_MEMCPY (default 0)
+NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0);
+NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both
+// Whether the send / recv side uses CE memcpy; reported as "CE" or "direct" in the shmSendSetup() INFO line
+static int useMemcpySend = 0;
+static int useMemcpyRecv = 0;
+NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-side, 2 is receiver-side
+// When equal to SHM_SEND_SIDE, shmSendSetup() adds the protocol buffer sizes to the sender's segment
+static int shmLocality = 0;
+// Called at the start of shmCanConnect(); its definition is outside this hunk
+static void initCeOperation();

 /* Determine two peers can communicate with SHM */
-ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  *ret = 0;
+  initCeOperation();

  if (ncclParamShmDisable() == 1) return ncclSuccess;

@@ -55,7 +65,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 #define MAX_SHM_NAME_LEN 1024

 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
+static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
  struct shmSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;
@@ -65,17 +75,20 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st

  char shmPath[PATH_MAX];
  shmPath[0] = '\0';
-  info->shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+  int shmSize = sizeof(struct ncclSendMem);
+  if (shmLocality == SHM_SEND_SIDE) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
+  }
+  info->shmSize = resources->shmSize = shmSize;
  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));

-  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d",
-      channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks);
+  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via SHM/%s/%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct", comm, comm->nRanks);
  return ncclSuccess;
 }

-ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
+static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
  struct shmRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;
@@ -86,7 +99,9 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  char shmPath[PATH_MAX];
  shmPath[0] = '\0';
  int shmSize = sizeof(struct ncclRecvMem);
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
+  if (shmLocality == SHM_RECV_SIDE) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
+  }
  info->shmSize = resources->shmSize = shmSize;
  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
@@ -95,8 +110,21 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  return ncclSuccess;
 }

+struct shmProxyInfo { // Request/response payload of the SHM proxy connect call, also kept as the proxy's per-connection state. Callers brace-initialize it positionally, so field order matters.
+  struct ncclRecvMem* ceRecvMem; // allocated by the proxy connect handler (ncclCudaHostCalloc); its tail/sizesFifo are handed back to the connector
+  char* devFifo; // staging buffer allocated by the proxy connect handler (ncclCudaCalloc), sized buffSizes[NCCL_PROTO_SIMPLE]
+  char* shmFifo; // caller-supplied: the connector's Simple-protocol buffer (conn.buffs[NCCL_PROTO_SIMPLE]) before it is replaced by devFifo
+  struct ncclSendMem* sendMem; // caller-supplied; not accessed by the proxy connect/progress/free functions
+  struct ncclRecvMem* recvMem; // caller-supplied; progress reads or writes its tail and sizesFifo
+
+  // used by progress only
+  uint64_t step; // where the next operation starts: rounded up to chunkSteps into sub->base, then set to base+nsteps on completion
+  hipStream_t stream; // stream the hipMemcpyAsync copies are enqueued on
+  hipEvent_t events[NCCL_STEPS]; // one event per buffer slot, recorded after each copy and polled with hipEventQuery
+};
+
 /* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -109,19 +137,29 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn
  // Remove the file to ensure proper clean-up
  NCCLCHECK(ncclShmUnlink(shmPath));

-  send->transportResources = resources;
-  int offset = 0;
+  char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset;
-    offset += send->comm->buffSizes[p];
+    send->conn.buffs[p] = buff;
+    buff += send->comm->buffSizes[p];
  }
  send->conn.tail = &resources->devRemHostMem->tail;
-
  send->conn.head = &resources->devHostMem->head;
+
+  if (useMemcpyRecv) {
+    send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
+  }
+  if (useMemcpySend) {
+    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
+    struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
+    NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
+    send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
+    send->conn.tail = &proxyInfo.ceRecvMem->tail;
+    send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
+  }
  return ncclSuccess;
 }

-ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
@@ -132,18 +170,26 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
  NCCLCHECK(ncclShmUnlink(shmPath));
-  recv->conn.head = &resources->devRemHostMem->head;

-  int offset = 0;
+  char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset;
-    offset += recv->comm->buffSizes[p];
+    recv->conn.buffs[p] = buff;
+    buff += recv->comm->buffSizes[p];
  }
+  recv->conn.head = &resources->devRemHostMem->head;
  recv->conn.tail = &resources->devHostMem->tail;
+
+  if (useMemcpyRecv) {
+    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
+    struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
+    NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
+    recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
+    recv->conn.tail = &proxyInfo.ceRecvMem->tail;
+  }
  return ncclSuccess;
 }

-ncclResult_t shmSendFree(struct ncclConnector* send) {
+static ncclResult_t shmSendFree(struct ncclConnector* send) {
  struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
  NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
  NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
@@ -151,7 +197,7 @@ ncclResult_t shmSendFree(struct ncclConnector* send) {
  return ncclSuccess;
 }

-ncclResult_t shmRecvFree(struct ncclConnector* recv) {
+static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
  NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
@@ -159,9 +205,209 @@ ncclResult_t shmRecvFree(struct ncclConnector* recv) {
  return ncclSuccess;
 }

+// Proxy-side handler for the sender's ncclProxyMsgConnect request.
+//   reqBuff/reqSize   : a struct shmProxyInfo filled in by shmSendConnect (shmFifo, sendMem, recvMem).
+//   respBuff/respSize : receives the completed struct shmProxyInfo (adds devFifo and ceRecvMem).
+//   done              : not written by this function.
+// Allocates the staging buffer, ceRecvMem, the copy stream and one event per step,
+// and stores the struct in connection->transportResources for progress/free.
+// Returns ncclInternalError when either buffer size differs from sizeof(struct shmProxyInfo).
+static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  // Validate both message sizes before allocating anything: checking after ncclCalloc
+  // leaked proxyInfo on a bad request, and a bad response size was only detected after
+  // every resource had already been created.
+  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
+  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
+  struct shmProxyInfo* proxyInfo;
+  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
+  memcpy(proxyInfo, reqBuff, reqSize);
+  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
+  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
+  CUDACHECK(hipStreamCreateWithFlags(&proxyInfo->stream, hipStreamNonBlocking));
+  for (int i=0; i<NCCL_STEPS; i++) {
+    CUDACHECK(hipEventCreate(proxyInfo->events+i));
+  }
+  connection->proxyAppendPtr = &connection->proxyAppend;
+  connection->transportResources = proxyInfo;
+  // Hand the completed descriptor (including devFifo and ceRecvMem) back to the caller.
+  memcpy(respBuff, proxyInfo, respSize);
+  return ncclSuccess;
+}
+
+// Proxy-side handler for the receiver's ncclProxyMsgConnect request.
+//   reqBuff/reqSize   : a struct shmProxyInfo filled in by shmRecvConnect (shmFifo, sendMem, recvMem).
+//   respBuff/respSize : receives the completed struct shmProxyInfo (adds devFifo and ceRecvMem).
+//   done              : not written by this function.
+// Allocates the staging buffer, ceRecvMem, the copy stream and one event per step,
+// and stores the struct in connection->transportResources for progress/free.
+// Returns ncclInternalError when either buffer size differs from sizeof(struct shmProxyInfo).
+static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  // Validate both message sizes before allocating anything: checking after ncclCalloc
+  // leaked proxyInfo on a bad request, and a bad response size was only detected after
+  // every resource had already been created.
+  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
+  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
+  struct shmProxyInfo* proxyInfo;
+  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
+  memcpy(proxyInfo, reqBuff, reqSize);
+  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
+  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
+  CUDACHECK(hipStreamCreateWithFlags(&proxyInfo->stream, hipStreamNonBlocking));
+  for (int i=0; i<NCCL_STEPS; i++) {
+    CUDACHECK(hipEventCreate(proxyInfo->events+i));
+  }
+  connection->proxyAppendPtr = &connection->proxyAppend;
+  connection->transportResources = proxyInfo;
+  // Hand the completed descriptor (including devFifo and ceRecvMem) back to the caller.
+  memcpy(respBuff, proxyInfo, respSize);
+  return ncclSuccess;
+}
+
+// Releases everything shmSendProxyConnect created for this proxy connection:
+// the copy stream, the device staging buffer, ceRecvMem, the per-step events
+// and the shmProxyInfo struct itself. Safe to call when the connect never
+// completed (transportResources still NULL).
+static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
+  // Nothing was allocated if the proxy connect failed or was never issued.
+  if (resources == NULL) return ncclSuccess;
+  CUDACHECK(hipStreamDestroy(resources->stream));
+  CUDACHECK(hipFree(resources->devFifo));
+  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
+  for (int i=0; i<NCCL_STEPS; i++) {
+    CUDACHECK(hipEventDestroy(resources->events[i]));
+  }
+  free(resources);
+  connection->transportResources = NULL; // avoid double free
+  return ncclSuccess;
+}
+
+// Releases everything shmRecvProxyConnect created for this proxy connection:
+// the copy stream, the device staging buffer, ceRecvMem, the per-step events
+// and the shmProxyInfo struct itself. Safe to call when the connect never
+// completed (transportResources still NULL).
+static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
+  // Nothing was allocated if the proxy connect failed or was never issued.
+  if (resources == NULL) return ncclSuccess;
+  CUDACHECK(hipStreamDestroy(resources->stream));
+  CUDACHECK(hipFree(resources->devFifo));
+  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
+  for (int i=0; i<NCCL_STEPS; i++) {
+    CUDACHECK(hipEventDestroy(resources->events[i]));
+  }
+  free(resources);
+  connection->transportResources = NULL; // avoid double free
+  return ncclSuccess;
+}
+
+static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { // Send-side proxy progress: copies Simple-protocol slices from devFifo to shmFifo and advances recvMem->tail as copies finish
+  if (args->state == ncclProxyOpReady) {
+    for (int s=0; s<args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs+s;
+      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
+      // Round to next multiple of chunkSteps
+      sub->base = ROUNDUP(resources->step, args->chunkSteps);
+      sub->posted = sub->transmitted = sub->done = 0;
+    }
+    args->state = ncclProxyOpProgress;
+  }
+  args->idle = 1; // NOTE(review): idle is never cleared in this function, even when a copy is issued or completes; confirm this is intended
+  if (args->state == ncclProxyOpProgress) {
+    int p = args->protocol;
+    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
+    for (int s=0; s<args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs+s;
+      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
+      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy; for other protocols the sub is marked complete without any copy
+          resources->step = sub->base + sub->nsteps;
+          args->done++;
+          continue;
+      }
+      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { // fewer than NCCL_STEPS copies outstanding and steps still left to issue
+        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
+        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
+        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
+        // Check GPU has sent everything
+        if ((*recvTail > sub->base+sub->transmitted)) {
+          int size = sizesFifo[buffSlot];
+          CUDACHECK(hipMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, hipMemcpyDeviceToHost, resources->stream));
+          CUDACHECK(hipEventRecord(resources->events[buffSlot], resources->stream)); // event for this slot is polled below to detect copy completion
+          resources->recvMem->sizesFifo[buffSlot] = size; // forward the slice size through recvMem; its tail is only advanced once the copy's event has completed
+          __sync_synchronize(); // make sure sizesFifo is visible
+          sub->transmitted += args->sliceSteps;
+        }
+      }
+      if (sub->done < sub->transmitted) {
+        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
+        hipError_t res = hipEventQuery(resources->events[buffSlot]);
+        if (res != hipErrorNotReady) CUDACHECK(res); // hipErrorNotReady means the copy is still in flight; any other non-success result is reported as an error
+        if (res == hipSuccess) {
+          sub->done += args->sliceSteps;
+          // Notify SHM
+          resources->recvMem->tail = sub->base + sub->done;
+        }
+        if (sub->done == sub->nsteps) { // every step of this sub has been copied: record where the next operation starts
+          resources->step = sub->base + sub->nsteps;
+          args->done++;
+        }
+      }
+    }
+    if (args->done == args->nsubs) {
+      args->state = ncclProxyOpNone;
+    }
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { // Recv-side proxy progress: copies Simple-protocol slices from shmFifo to devFifo and advances ceRecvMem->tail as copies finish
+  if (args->state == ncclProxyOpReady) {
+    for (int s=0; s<args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs+s;
+      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
+      // Round to next multiple of chunkSteps
+      sub->base = ROUNDUP(resources->step, args->chunkSteps);
+      sub->posted = sub->transmitted = sub->done = 0;
+    }
+    args->state = ncclProxyOpProgress;
+  }
+  args->idle = 1; // NOTE(review): idle is never cleared in this function, even when a copy is issued or completes; confirm this is intended
+  if (args->state == ncclProxyOpProgress) {
+    int p = args->protocol;
+    int stepSize = comm->buffSizes[p] / NCCL_STEPS;
+    for (int s=0; s<args->nsubs; s++) {
+      struct ncclProxySubArgs* sub = args->subs+s;
+      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
+      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy; for other protocols the sub is marked complete without any copy
+          resources->step = sub->base + sub->nsteps;
+          args->done++;
+          continue;
+      }
+      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { // fewer than NCCL_STEPS copies outstanding and steps still left to issue
+        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
+        volatile int* sizesFifo = resources->recvMem->sizesFifo;
+        volatile uint64_t* recvTail = &resources->recvMem->tail;
+        // Check data is ready in SHM
+        if ((*recvTail > sub->base+sub->transmitted)) {
+          int size = sizesFifo[buffSlot];
+          CUDACHECK(hipMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, hipMemcpyHostToDevice, resources->stream));
+          CUDACHECK(hipEventRecord(resources->events[buffSlot], resources->stream)); // event for this slot is polled below to detect copy completion
+          sub->transmitted += args->sliceSteps;
+        }
+      }
+      if (sub->done < sub->transmitted) {
+        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
+        hipError_t res = hipEventQuery(resources->events[buffSlot]);
+        if (res != hipErrorNotReady) CUDACHECK(res); // hipErrorNotReady means the copy is still in flight; any other non-success result is reported as an error
+        if (res == hipSuccess) {
+          sub->done += args->sliceSteps;
+          // Notify GPU
+          resources->ceRecvMem->tail = sub->base + sub->done;
+        }
+        if (sub->done == sub->nsteps) { // every step of this sub has been copied: record where the next operation starts
+          resources->step = sub->base + sub->nsteps;
+          args->done++;
+        }
+      }
+    }
+    if (args->done == args->nsubs) {
+      args->state = ncclProxyOpNone;
+    }
+  }
+  return ncclSuccess;
+}
+
 struct ncclTransport shmTransport = { // SHM transport table; the trailing NULL slots are patched at runtime by initCeOperation() (proxyConnect/proxyFree/proxyProgress) when the memcpy path is enabled
  "SHM",
  shmCanConnect,
  { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL }, // send side
  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL } // recv side
 };
+
+// One-time setup of the SHM transport's runtime options. Later calls are no-ops.
+//  - useMemcpySend / useMemcpyRecv: set when ShmUseCudaMemcpy is on and the matching
+//    bit of ShmMemcpyMode (1 = send side, 2 = recv side) is set; the corresponding
+//    proxyConnect/proxyFree/proxyProgress hooks are then installed in shmTransport.
+//  - shmLocality: taken from ShmLocality; any value other than SHM_SEND_SIDE or
+//    SHM_RECV_SIDE is rejected with a warning and replaced by SHM_RECV_SIDE.
+static void initCeOperation() {
+  static int initialized = 0;
+  if (initialized) return;
+
+  useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & SHM_SEND_SIDE);
+  useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & SHM_RECV_SIDE);
+
+  if (useMemcpySend) {
+    struct ncclTransportComm* sendComm = &shmTransport.send;
+    sendComm->proxyConnect = shmSendProxyConnect;
+    sendComm->proxyFree = shmSendProxyFree;
+    sendComm->proxyProgress = shmSendProxyProgress;
+  }
+  if (useMemcpyRecv) {
+    struct ncclTransportComm* recvComm = &shmTransport.recv;
+    recvComm->proxyConnect = shmRecvProxyConnect;
+    recvComm->proxyFree = shmRecvProxyFree;
+    recvComm->proxyProgress = shmRecvProxyProgress;
+  }
+
+  shmLocality = ncclParamShmLocality();
+  const int localityIsValid = (shmLocality == SHM_SEND_SIDE) || (shmLocality == SHM_RECV_SIDE);
+  if (!localityIsValid) {
+    WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)");
+    shmLocality = SHM_RECV_SIDE;
+  }
+  initialized = 1;
+}
@@ -6,7 +6,7 @@ endif
 HIPCC = $(HIP_PATH)/bin/hipcc

 EXE = topo_expl
-CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/rocm_smi/include/ -DTOPO_EXPL -DENABLE_TRACE
+CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE

 files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/misc/param.cc \
 	../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc ../../src/graph/rome_models.cc
@@ -27,6 +27,7 @@ struct allGather3Data_t{
  struct ncclGraphInfo ring;
  struct ncclGraphInfo collNet;
  struct ncclTopoRanks topoRanks;
+  bool pivotA2AEnabled;
 };

 void initCollNet();
@@ -179,10 +179,10 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));

  if (proxyRank == myInfo->rank) {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
        req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
  } else {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
        proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
  }
  *((int*)connectInfo) = proxyRank;
@@ -205,7 +205,7 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));

-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
      req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
  return ncclSuccess;
 }
@@ -250,8 +250,9 @@ struct ncclTransport collNetTransport = {
  { collNetRecvSetup, NULL, NULL, NULL }
 };

-struct ncclTransport ncclTransports[NTRANSPORTS] = {
-  p2pTransport,
-  shmTransport,
-  netTransport,
+struct ncclTransport* ncclTransports[] = {
+  &p2pTransport,
+  &shmTransport,
+  &netTransport,
+  &collNetTransport,
 };
@@ -49,6 +49,8 @@ THE SOFTWARE.
 #include "graph.h"

 NodeModel *node_model;
+extern ncclNet_t* ncclNet;
+

 char* getCmdOption(char ** begin, char ** end, const std::string & option) {
    char ** itr = std::find(begin, end, option);
@@ -216,14 +218,12 @@ int main(int argc,char* argv[])
    comm[i].nRanks = nranks;
    NCCLCHECK(ncclCalloc(&comm[i].connectSend, NCCL_MAX_CONNS*comm->nRanks));
    NCCLCHECK(ncclCalloc(&comm[i].connectRecv, NCCL_MAX_CONNS*comm->nRanks));
-    comm[i].p2pSendCount = comm[i].p2pRecvCount = 0;
-    NCCLCHECK(ncclCalloc(&comm[i].p2pSends, comm->nRanks));
-    NCCLCHECK(ncclCalloc(&comm[i].p2pRecvs, comm->nRanks));
    node_model = network.GetNode(i);
    assert(node_model!=0);
    comm[i].busId = node_model->getGpuBusId(i);
    comm[i].topo = node_model->getSystem(i);
    comm[i].peerInfo = peerInfo;
+    comm[i].ncclNet = ncclNet;
    // Mark channels as non initialized.
    for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
    NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
@@ -272,8 +272,6 @@ int main(int argc,char* argv[])
  for (int i = 0; i < nranks; i++) {
    free(comm[i].connectSend);
    free(comm[i].connectRecv);
-    free(comm[i].p2pSends);
-    free(comm[i].p2pRecvs);
  }

  free(treeGraph);
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -216,20 +216,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
  struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
  struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
                                                  comm->channels[channelId].peers[peer].recv + connIndex;
-
  // handle intra-node network connections
  int n1 = -1, n2 = -1;
  if (connIndex == NCCL_CONN_IDX_P2P_NET) {
    NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, comm->rank, graph, channelId, (type == 1) ? 1 : 0, &n1));
    NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 0 : 1, &n2));
  }
-
  bool xgmi;
  NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
+
  for (int t=0; t<NTRANSPORTS; t++) {
    if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
    if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
-    struct ncclTransport *transport = ncclTransports+t;
+    struct ncclTransport *transport = ncclTransports[t];
    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
    int ret = 0;
    NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
@@ -244,18 +243,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
  return ncclSystemError;
 }

-ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
-  uint32_t mask = 1 << channel->id;
+  struct ncclChannel* channel = &comm->channels[channelId];
+  uint32_t mask = 1 << channelId;
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
-    comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
+    comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
-    comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
+    comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
  }
  return ncclSuccess;
 }
@@ -271,17 +271,18 @@ void dumpData(struct ncclConnect* data, int ndata) {

 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
  // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
+  int highestType = TRANSPORT_P2P;  // track highest transport type
+
  //hipStream_t transportSetupStream;
  //CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
-  int highestType = TRANSPORT_P2P;  // track highest transport type

  struct ncclConnect data[2*MAXCHANNELS];
  for (int i=1; i<comm->nRanks; i++) {
    int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
    int sendPeer = (comm->rank + i) % comm->nRanks;
-    uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
-    uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
+    uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
+    uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];

    struct ncclConnect* recvData = data;
    int sendChannels = 0, recvChannels = 0;
@@ -319,7 +320,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
        struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
        //NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
        conn->connected = 1;
-        //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
+        //CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
      }
    }
    for (int c=0; c<MAXCHANNELS; c++) {
@@ -327,10 +328,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
        struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
        //NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
        conn->connected = 1;
-        //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
+        //CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
      }
    }
-    comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
+    comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
  }
  //CUDACHECK(hipStreamSynchronize(transportSetupStream));
  //CUDACHECK(hipStreamDestroy(transportSetupStream));
@@ -357,10 +358,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
  // check if we can connect to collnet, whose root is the nranks-th rank
  struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
  peerInfo->rank = nranks;
-  int support = 1;
-  if (isMaster) {
-    NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo));
-  }

  // send master receives connect info from peer recv master
  if (isMaster && type == collNetSend) {
@@ -370,14 +367,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
  }

  // select
-  struct ncclPeer* root = channel->peers+nranks;
+  struct ncclChannelPeer* root = channel->peers+nranks;
  // connector index: 0 for recv, 1 for send
  struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
  struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
  conn->transportComm = transportComm;
  // setup
  struct ncclConnect myConnect;
-  if (isMaster && support) {
+  if (isMaster) {
    NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
  }
  // prepare connect handles
@@ -407,11 +404,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
    //if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
  }
  // connect
-  if (isMaster && support) {
+  if (isMaster) {
    //NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
-    struct ncclPeer* devRoot = channel->devPeers+nranks;
-    struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
-    //CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
+    struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
+    struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
+    //CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup);
  }
  // recv side sends connect info to send side
  if (isMaster && type == collNetRecv) {
@@ -420,7 +417,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
    //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
    TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
  }
-  if (support) fail = 0;
+  fail = 0;
 cleanup:
  if (allConnects != NULL) free(allConnects);
  if (masterConnects != NULL) free(masterConnects);
@@ -449,21 +446,24 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
  // Free collNet resources
  for (int r=0; r<comm->nChannels; r++) {
    struct ncclChannel* channel = comm->channels+r;
-    struct ncclPeer* peer = channel->peers+comm->nRanks;
+    struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
    for (int b=0; b<NCCL_MAX_CONNS; b++) {
      struct ncclConnector* send = peer->send + b;
-      //if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
+      //if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
      send->transportResources = NULL; // avoid double free
    }
    for (int b=0; b<NCCL_MAX_CONNS; b++) {
      struct ncclConnector* recv = peer->recv + b;
-      //if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
+      //if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
      recv->transportResources = NULL; // avoid double free
    }
  }
  return ncclSuccess;
 }

+RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
+RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0);
+
 ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
  // We use 2 AllGathers
@@ -499,12 +499,15 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
  comm->topo->nRanks = comm->nRanks;
  // init netGdrLevel
  comm->topo->netGdrLevel = -2;
+  // init Pivot A2A related fields
+  comm->topo->pivotA2AEnabled = false;
+  comm->topo->pivotA2ANumBiRings = 0;
  // Compute paths between GPUs and NICs
-  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
  // Remove inaccessible GPUs and unused NICs
  NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
  // Recompute paths after trimming
-  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
  // Init search
  NCCLCHECK(ncclTopoSearchInit(comm->topo));
  // Print final topology
@@ -571,39 +574,31 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
    }
  }

-#if 0
-  { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
-    CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
-    if (comm->localRanks == comm->nRanks && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)
-    {
-      if (hasPeerAccess)
-      {
-        if (intraProcRanks == nranks)
-          cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
-        else
-          cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
-      }
-
-      // For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
-      if (!allXgmi && !rcclParamCliqueIgnoreTopo())
-      {
-        INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
-        cliqueMode = CliqueManager::CLIQUE_DISABLED;
-      }
-    }
-    comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
-    NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
-  } // [/RCCL]
-#endif
-
  if (comm->rank == ncclParamGraphDumpFileRank()) {
    struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
    NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
  }

  // Determine local CollNet support before all-gather
-  if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
+  if (collNetSupport(comm)) {
+    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
+    if (collNetEnable != NULL) {
+      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
+      if (strcmp(collNetEnable, "1") == 0) {
+        comm->collNetSupport = 1;
+      }
+    }
+  }
+  if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0;

+  if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
+    if (rcclParamP2pNetDisable() == 0) {
+      if (!(comm->topo->type & RCCL_TOPO_FORCE_INTRA)) comm->p2pNet = 1;
+      INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
+    }
+    else
+      INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
+  }
  // AllGather3 - begin
 #if 0
  struct ncclGraphInfo {
@@ -624,6 +619,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
    struct ncclGraphInfo ring;
    struct ncclGraphInfo collNet;
    struct ncclTopoRanks topoRanks;
+    bool pivotA2AEnabled;
  } *allGather3Data;

  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
@@ -666,6 +662,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
  allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
  allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
  allGather3Data[rank].collNetSupport = comm->collNetSupport;
+  allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();

  comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
    ? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
@@ -758,6 +755,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
    collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
    comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
+    comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled;
  }

  comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
@@ -818,16 +816,16 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    struct ncclChannel* channel = comm->channels+c;
    NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
    if (comm->nRanks == 1) continue;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
  }
  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
-  if (ringGraph.nIntraChannels) {
+  if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
    comm->useIntraNet = 1;
    // Connect NET for intranode use
    for (int c=0; c<comm->nChannels; c++) {
      struct ncclChannel* channel = comm->channels+c;
      if (comm->nRanks == 1) continue;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
  }
@@ -838,8 +836,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    if (comm->nRanks == 1) continue;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
  }
  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
  INFO(NCCL_INIT, "Connected all trees");
@@ -861,7 +859,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
      for (int h=0; h<nHeads; h++) {
        const int head = heads[h];
        collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv);
-        if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
+        collNetSetupFail += ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
      }
      // Verify CollNet setup across ranks after trying the first channel
      if (c == 0) {
@@ -876,12 +874,12 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    int highestTransportType0, highestTransportType1;
    for (int c=0; c<comm->nChannels; c++) {
      struct ncclChannel* channelRecv = comm->channels+c;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
    for (int c=0; c<comm->nChannels; c++) {
      struct ncclChannel* channelSend = comm->channels+c;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);

@@ -919,6 +917,52 @@ collnet_cleanup:
  // Compute nChannels per peer for p2p
  NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 #if 0
+  do { // Setup p2p structures in comm->tasks
+    struct ncclTasks* tasks = &comm->tasks;
+    int nRanks = comm->nRanks;
+    int node = comm->node;
+    int nNodes = comm->nNodes;
+    struct ncclNodeRanks *nodeRanks = comm->nodeRanks;
+    int localRank = comm->localRank;
+    tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, nRanks);
+    tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
+    tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
+    int s=0, r=0;
+    // schedule delta 0, +1, -1, +2, -2, ...
+    // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
+    for (int d=0; d <= nNodes/4; d++) {
+      int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
+      int index = 0;
+      int delta = deltas[index];
+    sched_delta:
+      int recvNode = (node+nNodes-delta)%nNodes;
+      int sendNode = (node+delta)%nNodes;
+      int steps = comm->maxLocalRanks;
+      for (int step=0; step < steps; step++) {
+        int recvIndex = (localRank-step+steps)%steps;
+        if (recvIndex < nodeRanks[recvNode].localRanks) {
+          tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex];
+          r++;
+        }
+        int sendIndex = (localRank+step)%steps;
+        if (sendIndex < nodeRanks[sendNode].localRanks) {
+          tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex];
+          s++;
+        }
+      }
+      index++;
+      if (index == 1 && deltas[1] == deltas[0]) index++;
+      if (index == 2 && deltas[2] == deltas[0]) index++;
+      if (index == 3 && deltas[3] == deltas[2]) index++;
+      if (index == 3 && deltas[3] == deltas[1]) index++;
+      if (index < 4) {
+        delta = deltas[index];
+        goto sched_delta;
+      }
+    }
+    assert(s == nRanks && r == nRanks);
+  } while (0);
+
  if (ncclParamNvbPreconnect()) {
    // Connect p2p when using NVB path
    int nvbNpeers;
@@ -926,18 +970,17 @@ collnet_cleanup:
    NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
    for (int r=0; r<nvbNpeers; r++) {
      int peer = nvbPeers[r];
-      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+      int channelId;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
-          comm->connectRecv[peer] |= (1<<channelId);
+        NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
+        if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
+          comm->connectSend[peer] |= (1<<channelId);
        }
      }
-      delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
-          comm->connectSend[peer] |= (1<<channelId);
+        NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
+        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
+          comm->connectRecv[peer] |= (1<<channelId);
        }
      }
    }
@@ -947,18 +990,17 @@ collnet_cleanup:
 #endif
  // Connect to local net proxy
  struct ncclProxyConnector proxyConn;
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, comm->rank, &proxyConn.localRank));
  //NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
  //NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));

  // Then to remote ones when using PXN
-  if (ncclPxnDisable() == 0) {
+  if (ncclPxnDisable(comm) == 0) {
    int nranks;
    int* pxnPeers;
    NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
    for (int r=0; r<nranks; r++) {
      //NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
-      //NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+     // NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
    }
    free(pxnPeers);
  }
@@ -973,6 +1015,10 @@ collnet_cleanup:
        if (intraProcRanks == 0) intraProcRank0 = i;
        if (i == rank) intraProcRank = intraProcRanks;
        intraProcRanks++;
+        if (intraProcRank0 == rank && rank != i) {
+          comm->peerInfo[i].comm->intraNext = comm->intraNext;
+          comm->intraNext = comm->peerInfo[i].comm;
+        }
      }
    }
    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
@@ -983,14 +1029,40 @@ collnet_cleanup:
          intraProcRank, intraProcRanks, intraProcRank0);
      return ncclInternalError;
    }
-    //NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
+    struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
+    assert(intraProcRank==0 ? comm==comm0 : true);
+    comm->intraComm0 = comm0;
+    comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0;
+    comm->intraRank = intraProcRank;
+    comm->intraRanks = intraProcRanks;
+    comm->intraBarrierPhase = 0;
+    comm->intraBarrierCounter = 0;
+    comm->intraBarrierGate = 0;
  } while(0);

+#if 0
+  if (comm->intraRank == 0) { // Load ncclParamLaunchMode
+    char* str = getenv("NCCL_LAUNCH_MODE");
+    enum ncclLaunchMode mode, modeOld;
+    if (str && strcasecmp(str, "GROUP") == 0) {
+      mode = ncclLaunchModeGroup;
+    } else {
+      mode = ncclLaunchModeParallel;
+    }
+    // In theory we could be racing with other communicators not associated with
+    // this one if the user is connecting to multiple ncclUniqueId's concurrently.
+    modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
+    if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
+      INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
+    }
+  }
+
  /* Local intra-node barrier */
  //NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));

  // Unlink proxy shm to make sure it will be properly cleaned up.
-  //NCCLCHECK(ncclProxyShmUnlink(comm));
+  NCCLCHECK(ncclProxyShmUnlink(comm));
+#endif

  // We should have allocated all buffers, collective fifos, ... we can
  // restore the affinity.
@@ -1013,3 +1085,7 @@ ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* d
 ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *bw) {
  return ncclSuccess;
 }
+
+int ncclNetVersion(struct ncclComm* comm) {
+  return 4;
+}